gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations whose vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155 /* Forward declaration (the definition is not in this part of the file).  */
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157
 158 /* Helper for vect_determine_vf_for_stmt that deals with a single
 159    statement, STMT_INFO.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 160    may already be set for general statements (not just data refs).  */
 161
 162 static bool
 163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
 164                               bool vectype_maybe_set_p,
 165                               poly_uint64 *vf,
 166                               vec<stmt_vec_info > *mask_producers)
 167 {
 168   /* Clobbers and statements that are neither relevant nor live are
 169      skipped without being treated as a failure.  */
 170   bool relevant_or_live_p = (STMT_VINFO_RELEVANT_P (stmt_info)
 171                              || STMT_VINFO_LIVE_P (stmt_info));
 172   if (!relevant_or_live_p || gimple_clobber_p (stmt_info->stmt))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 176       return true;
 177     }
 178
 179   tree vectype_for_stmt, vectype_for_nunits;
 180   if (!vect_get_vector_types_for_stmt (stmt_info, &vectype_for_stmt,
 181                                        &vectype_for_nunits))
 182     return false;
 183
 184   if (vectype_for_stmt)
 185     {
 186       tree recorded_vectype = STMT_VINFO_VECTYPE (stmt_info);
 187       if (recorded_vectype)
 188         /* A vector type can only have been recorded already for stmts
 189            that contain a data ref, or for "pattern-stmts" (stmts created
 190            by the vectorizer to stand in for a recognized idiom).  */
 191         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info) || vectype_maybe_set_p)
 192                     && recorded_vectype == vectype_for_stmt);
 193       else if (vectype_for_stmt == boolean_type_node)
 194         mask_producers->safe_push (stmt_info);
 195       else
 196         STMT_VINFO_VECTYPE (stmt_info) = vectype_for_stmt;
 197     }
 198
 199   if (vectype_for_nunits)
 200     vect_update_max_nunits (vf, vectype_for_nunits);
 201
 202   return true;
 203 }
 204
 205 /* Helper for vect_determine_vectorization_factor.  Record vector types
 206    for STMT_INFO and, if it has been replaced by a pattern, for the
 207    pattern statement and its pattern definition statements as well,
 208    updating the vectorization factor *VF as we go.  Statements whose mask
 209    vector type has to be computed later are queued on MASK_PRODUCERS.
 210    Return false if something prevents vectorization, true otherwise.  */
 211
 212 static bool
 213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
 214                             vec<stmt_vec_info > *mask_producers)
 215 {
 216   if (dump_enabled_p ())
 217     {
 218       dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
 219       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 220     }
 221   if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
 222     return false;
 223
 224   /* Nothing more to do unless the statement is part of a pattern.  */
 225   if (!STMT_VINFO_IN_PATTERN_P (stmt_info)
 226       || !STMT_VINFO_RELATED_STMT (stmt_info))
 227     return true;
 228
 229   stmt_vec_info pattern_info
 230     = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
 231
 232   /* Start with the statements in the pattern definition sequence.  */
 233   gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (pattern_info);
 234   gimple_stmt_iterator gsi = gsi_start (def_seq);
 235   while (!gsi_end_p (gsi))
 236     {
 237       stmt_vec_info def_info = vinfo_for_stmt (gsi_stmt (gsi));
 238       if (dump_enabled_p ())
 239         {
 240           dump_printf_loc (MSG_NOTE, vect_location,
 241                            "==> examining pattern def stmt: ");
 242           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_info->stmt, 0);
 243         }
 244       if (!vect_determine_vf_for_stmt_1 (def_info, true, vf, mask_producers))
 245         return false;
 246       gsi_next (&gsi);
 247     }
 248
 249   /* Then handle the pattern statement itself.  */
 250   if (dump_enabled_p ())
 251     {
 252       dump_printf_loc (MSG_NOTE, vect_location,
 253                        "==> examining pattern statement: ");
 254       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, pattern_info->stmt, 0);
 255     }
 256   return vect_determine_vf_for_stmt_1 (pattern_info, true, vf,
 257                                        mask_producers);
 258 }
 259
 260 /* Function vect_determine_vectorization_factor
 261
 262    Compute the vectorization factor (VF) of the loop described by
 263    LOOP_VINFO and store it in LOOP_VINFO_VECT_FACTOR.  VF is the number of
 264    data elements processed in parallel by one iteration of the vectorized
 265    loop; e.g. with 4-byte elements and 16-byte vectors, 4 elements fit in
 266    a vector register and VF is 4.
 267
 268    Along the way a vector type is recorded for every relevant or live PHI
 269    and statement in the loop; statements producing masks get their type
 270    only once the final VF is known.  Return false if the analysis of some
 271    PHI or statement fails or if the resulting VF is known to be at most 1.
 272
 273    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 274    original loop:
 275         for (i=0; i<N; i++){
 276           a[i] = b[i] + c[i];
 277         }
 278
 279    vectorized loop:
 280         for (i=0; i<N; i+=VF){
 281           a[i:VF] = b[i:VF] + c[i:VF];
 282         }
 283 */
 284
 285 static bool
 286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 287 {
 288   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 289   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 290   unsigned nbbs = loop->num_nodes;
 291   poly_uint64 vectorization_factor = 1;
 292   auto_vec<stmt_vec_info> mask_producers;
 293
 294   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 295
 296   for (unsigned bb_index = 0; bb_index < nbbs; bb_index++)
 297     {
 298       basic_block bb = bbs[bb_index];
 299
 300       /* First the PHI nodes of the block; each relevant or live one gets
 301          the vector type chosen for the scalar type of its result.  */
 302       gphi_iterator phi_gsi = gsi_start_phis (bb);
 303       for (; !gsi_end_p (phi_gsi); gsi_next (&phi_gsi))
 304         {
 305           gphi *phi = phi_gsi.phi ();
 306           stmt_vec_info phi_info = vinfo_for_stmt (phi);
 307           if (dump_enabled_p ())
 308             {
 309               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 310               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 311             }
 312
 313           gcc_assert (phi_info);
 314
 315           /* Only relevant or live PHIs need a vector type.  */
 316           if (!STMT_VINFO_RELEVANT_P (phi_info)
 317               && !STMT_VINFO_LIVE_P (phi_info))
 318             continue;
 319
 320           gcc_assert (!STMT_VINFO_VECTYPE (phi_info));
 321           tree scalar_type = TREE_TYPE (PHI_RESULT (phi));
 322
 323           if (dump_enabled_p ())
 324             {
 325               dump_printf_loc (MSG_NOTE, vect_location,
 326                                "get vectype for scalar type:  ");
 327               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 328               dump_printf (MSG_NOTE, "\n");
 329             }
 330
 331           tree vectype = get_vectype_for_scalar_type (scalar_type);
 332           if (!vectype)
 333             {
 334               if (dump_enabled_p ())
 335                 {
 336                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 337                                    "not vectorized: unsupported "
 338                                    "data-type ");
 339                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 340                                      scalar_type);
 341                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 342                 }
 343               return false;
 344             }
 345           STMT_VINFO_VECTYPE (phi_info) = vectype;
 346
 347           if (dump_enabled_p ())
 348             {
 349               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 350               dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 351               dump_printf (MSG_NOTE, "\n");
 352               dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 353               dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 354               dump_printf (MSG_NOTE, "\n");
 355             }
 356
 357           vect_update_max_nunits (&vectorization_factor, vectype);
 358         }
 359
 360       /* Then the ordinary statements, including any pattern statements
 361          attached to them.  */
 362       gimple_stmt_iterator stmt_gsi = gsi_start_bb (bb);
 363       for (; !gsi_end_p (stmt_gsi); gsi_next (&stmt_gsi))
 364         {
 365           stmt_vec_info info = vinfo_for_stmt (gsi_stmt (stmt_gsi));
 366           if (!vect_determine_vf_for_stmt (info, &vectorization_factor,
 367                                            &mask_producers))
 368             return false;
 369         }
 370     }
 371
 372   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 373   if (dump_enabled_p ())
 374     {
 375       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 376       dump_dec (MSG_NOTE, vectorization_factor);
 377       dump_printf (MSG_NOTE, "\n");
 378     }
 379
 380   /* A factor that is known not to exceed 1 is treated as a failure.  */
 381   if (known_le (vectorization_factor, 1U))
 382     {
 383       if (dump_enabled_p ())
 384         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 385                          "not vectorized: unsupported data-type\n");
 386       return false;
 387     }
 388   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 389
 390   /* With the factor recorded, give the queued mask producers their
 391      vector types.  */
 392   unsigned producer_index;
 393   stmt_vec_info producer_info;
 394   FOR_EACH_VEC_ELT (mask_producers, producer_index, producer_info)
 395     {
 396       tree mask_type = vect_get_mask_type_for_stmt (producer_info);
 397       if (!mask_type)
 398         return false;
 399       STMT_VINFO_VECTYPE (producer_info) = mask_type;
 400     }
 401
 402   return true;
 403 }
 404
 405
 406 /* Function vect_is_simple_iv_evolution.
 407    Return true if ACCESS_FN evolves "simply" in loop number LOOP_NB; the initial
 408    value and step are passed back in *INIT and *STEP.  FORNOW: A simple evolution of
 409    an induction variable in the loop is considered a polynomial evolution.  */
 410
 411 static bool
 412 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 413                              tree * step)
 414 {
 415   tree init_expr;
 416   tree step_expr;
 417   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 418   basic_block bb;
 419
 420   /* When there is no evolution in this loop, the evolution function
 421      is not "simple".  *INIT and *STEP are left untouched in this case.  */
 422   if (evolution_part == NULL_TREE)
 423     return false;
 424
 425   /* When the evolution is a polynomial of degree >= 2
 426      the evolution function is not "simple" (outputs untouched as well).  */
 427   if (tree_is_chrec (evolution_part))
 428     return false;
 429
 430   step_expr = evolution_part;
 431   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 432
 433   if (dump_enabled_p ())
 434     {
 435       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 436       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 437       dump_printf (MSG_NOTE, ",  init: ");
 438       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 439       dump_printf (MSG_NOTE, "\n");
 440     }
 441   /* Stored before the step check, so both are set even if "step unknown" is hit.  */
 442   *init = init_expr;
 443   *step = step_expr;
 444   /* Reject the step unless it has one of the three forms tested below.  */
 445   if (TREE_CODE (step_expr) != INTEGER_CST /* 1: any integer constant.  */
 446       && (TREE_CODE (step_expr) != SSA_NAME /* 2: an SSA name that is ...  */
 447           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 448               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb)) /* ... not defined in the loop ...  */
 449           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr)) /* ... and integral, or float ...  */
 450               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 451                   || !flag_associative_math))) /* ... with flag_associative_math set.  */
 452       && (TREE_CODE (step_expr) != REAL_CST /* 3: a real constant ...  */
 453           || !flag_associative_math)) /* ... with flag_associative_math set.  */
 454     {
 455       if (dump_enabled_p ())
 456         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 457                          "step unknown.\n");
 458       return false;
 459     }
 460
 461   return true;
 462 }
 463
 464 /* Function vect_analyze_scalar_cycles_1.
 465    Classify each non-virtual PHI in the header of LOOP via STMT_VINFO_DEF_TYPE:
 466    examine the cross iteration def-use cycles of scalar variables
 467    in LOOP.  LOOP_VINFO represents the loop that is now being
 468    considered for vectorization (can be LOOP, or an outer-loop
 469    enclosing LOOP).  */
 470
 471 static void
 472 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 473 {
 474   basic_block bb = loop->header;
 475   tree init, step;
 476   auto_vec<gimple *, 64> worklist;
 477   gphi_iterator gsi;
 478   bool double_reduc;
 479
 480   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 481
 482   /* First - identify all inductions.  Reduction detection assumes that all the
 483      inductions have been identified, therefore, this order must not be
 484      changed.  */
 485   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 486     {
 487       gphi *phi = gsi.phi ();
 488       tree access_fn = NULL;
 489       tree def = PHI_RESULT (phi);
 490       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 491
 492       if (dump_enabled_p ())
 493         {
 494           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 495           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 496         }
 497
 498       /* Skip virtual phi's.  The data dependences that are associated with
 499          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 500       if (virtual_operand_p (def))
 501         continue;
 502       /* Default classification; it may be refined below or in the second phase.  */
 503       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 504
 505       /* Analyze the evolution function.  */
 506       access_fn = analyze_scalar_evolution (loop, def);
 507       if (access_fn)
 508         {
 509           STRIP_NOPS (access_fn);
 510           if (dump_enabled_p ())
 511             {
 512               dump_printf_loc (MSG_NOTE, vect_location,
 513                                "Access function of PHI: ");
 514               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 515               dump_printf (MSG_NOTE, "\n");
 516             }
 517           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 518             = initial_condition_in_loop_num (access_fn, loop->num);
 519           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 520             = evolution_part_in_loop_num (access_fn, loop->num);
 521         }
 522       /* Everything that is not a simple induction is queued for the second phase.  */
 523       if (!access_fn
 524           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 525           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 526               && TREE_CODE (step) != INTEGER_CST)) /* Not the vectorized loop: need a constant step.  */
 527         {
 528           worklist.safe_push (phi);
 529           continue;
 530         }
 531       /* A simple induction: the evolution parts recorded above must both be present.  */
 532       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 533                   != NULL_TREE);
 534       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 535
 536       if (dump_enabled_p ())
 537         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 538       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 539     }
 540
 541
 542   /* Second - identify all reductions and nested cycles.  */
 543   while (worklist.length () > 0)
 544     {
 545       gimple *phi = worklist.pop ();
 546       tree def = PHI_RESULT (phi);
 547       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 548       gimple *reduc_stmt;
 549
 550       if (dump_enabled_p ())
 551         {
 552           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 553           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 554         }
 555       /* Only non-virtual PHIs still marked unknown are ever queued by the first phase.  */
 556       gcc_assert (!virtual_operand_p (def)
 557                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 558       /* DOUBLE_REDUC is passed by address and tested below when a stmt is returned.  */
 559       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 560                                                 &double_reduc, false);
 561       if (reduc_stmt)
 562         {
 563           if (double_reduc)
 564             {
 565               if (dump_enabled_p ())
 566                 dump_printf_loc (MSG_NOTE, vect_location,
 567                                  "Detected double reduction.\n");
 568               /* Both the PHI and the returned stmt get the same classification.  */
 569               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 570               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 571                                                     vect_double_reduction_def;
 572             }
 573           else
 574             {
 575               if (loop != LOOP_VINFO_LOOP (loop_vinfo)) /* LOOP is not the loop being vectorized.  */
 576                 {
 577                   if (dump_enabled_p ())
 578                     dump_printf_loc (MSG_NOTE, vect_location,
 579                                      "Detected vectorizable nested cycle.\n");
 580
 581                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 582                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 583                                                              vect_nested_cycle;
 584                 }
 585               else
 586                 {
 587                   if (dump_enabled_p ())
 588                     dump_printf_loc (MSG_NOTE, vect_location,
 589                                      "Detected reduction.\n");
 590
 591                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 592                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 593                                                            vect_reduction_def;
 594                   /* Store the reduction cycles for possible vectorization in
 595                      loop-aware SLP if it was not detected as reduction
 596                      chain.  */
 597                   if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 598                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 599                 }
 600             }
 601         }
 602       else /* No reduction found: the PHI keeps vect_unknown_def_type.  */
 603         if (dump_enabled_p ())
 604           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 605                            "Unknown def-use cycle pattern.\n");
 606     }
 607 }
 608
 609
 610 /* Function vect_analyze_scalar_cycles.
 611
 612    Classify the cross iteration def-use cycles of scalar variables by
 613    looking at the loop-header PHIs of scalar variables.  Each cycle ends
 614    up as an induction, a reduction (including double reductions and
 615    nested cycles) or unknown.  This is done for the loop represented by
 616    LOOP_VINFO and, when it has one, for its inner loop as well.
 617    Examples of scalar cycles:
 618
 619    Example1: reduction:
 620
 621               loop1:
 622               for (i=0; i<N; i++)
 623                  sum += a[i];
 624
 625    Example2: induction:
 626
 627               loop2:
 628               for (i=0; i<N; i++)
 629                  a[i] = i;  */
 630
 631 static void
 632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 633 {
 634   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
 635   vect_analyze_scalar_cycles_1 (loop_vinfo, vect_loop);
 636
 637   /* In outer-loop vectorization the inner loop still runs sequentially,
 638      so reductions inside it differ from those of the vectorized nest:
 639      1. They execute in the same order as in the scalar loop, so the
 640         order of computation must not be changed when vectorizing them.
 641      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 642         current checks are too strict.  */
 643   struct loop *nested_loop = vect_loop->inner;
 644   if (nested_loop == NULL)
 645     return;
 646
 647   /* The inner loop is analyzed against the same LOOP_VINFO.  */
 648   vect_analyze_scalar_cycles_1 (loop_vinfo, nested_loop);
 649 }
 650
 651 /* Copy the reduction-group data of the chain starting at STMT to its pattern stmts.  */
 652
 653 static void
 654 vect_fixup_reduc_chain (gimple *stmt)
 655 {
 656   stmt_vec_info orig_info = vinfo_for_stmt (stmt);
 657   gimple *pattern_head = STMT_VINFO_RELATED_STMT (orig_info);
 658   stmt_vec_info pattern_info = vinfo_for_stmt (pattern_head);
 659   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (pattern_info)
 660               && REDUC_GROUP_FIRST_ELEMENT (orig_info));
 661   REDUC_GROUP_SIZE (pattern_info) = REDUC_GROUP_SIZE (orig_info);
 662   REDUC_GROUP_FIRST_ELEMENT (pattern_info) = pattern_head;
 663   while (gimple *next_orig = REDUC_GROUP_NEXT_ELEMENT (orig_info))
 664     {
 665       /* Mirror the link from the previous chain member to this one.  */
 666       orig_info = vinfo_for_stmt (next_orig);
 667       gimple *next_pattern = STMT_VINFO_RELATED_STMT (orig_info);
 668       REDUC_GROUP_NEXT_ELEMENT (pattern_info) = next_pattern;
 669       pattern_info = vinfo_for_stmt (next_pattern);
 670       REDUC_GROUP_FIRST_ELEMENT (pattern_info) = pattern_head;
 671     }
 672   STMT_VINFO_DEF_TYPE (pattern_info) = vect_reduction_def;
 673 }
 674
 675 /* Redirect reduction chains whose stmts were all replaced by patterns.  */
 676
 677 static void
 678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 679 {
 680   gimple *chain_head;
 681   unsigned chain_index;
 682
 683   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), chain_index,
 684                     chain_head)
 685     {
 686       if (!STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (chain_head)))
 687         continue;
 688
 689       /* Look for a later chain member that is not part of a pattern.  */
 690       gimple *member = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (chain_head));
 691       while (member && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (member)))
 692         member = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (member));
 693
 694       /* Chains that are only partly covered by patterns are left alone
 695          so that they can be handled without patterns.  */
 696       if (member)
 697         continue;
 698       vect_fixup_reduc_chain (chain_head);
 699       LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[chain_index]
 700         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (chain_head));
 701     }
 702 }
 703
 704 /* Function vect_get_loop_niters.
 705
 706    Determine how many iterations the loop is executed and place it
 707    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 708    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 709    niter information holds in ASSUMPTIONS.
 710    On failure both counts stay chrec_dont_know and *ASSUMPTIONS stays boolean_true_node.
 711    Return the loop exit condition.  */
 712
 713
 714 static gcond *
 715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
 716                       tree *number_of_iterations, tree *number_of_iterationsm1)
 717 {
 718   edge exit = single_exit (loop);
 719   struct tree_niter_desc niter_desc;
 720   tree niter_assumptions, niter, may_be_zero;
 721   gcond *cond = get_loop_exit_condition (loop);
 722   /* Defaults for the outputs; every early "return cond" below leaves them in place.  */
 723   *assumptions = boolean_true_node;
 724   *number_of_iterationsm1 = chrec_dont_know;
 725   *number_of_iterations = chrec_dont_know;
 726   DUMP_VECT_SCOPE ("get_loop_niters");
 727   /* single_exit gave no edge: nothing to compute.  */
 728   if (!exit)
 729     return cond;
 730
 731   niter = chrec_dont_know;
 732   may_be_zero = NULL_TREE;
 733   niter_assumptions = boolean_true_node;
 734   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 735       || chrec_contains_undetermined (niter_desc.niter))
 736     return cond;
 737   /* Pick up the analysis results from NITER_DESC.  */
 738   niter_assumptions = niter_desc.assumptions;
 739   may_be_zero = niter_desc.may_be_zero;
 740   niter = niter_desc.niter;
 741   /* A may_be_zero that is constant zero is treated like an absent one.  */
 742   if (may_be_zero && integer_zerop (may_be_zero))
 743     may_be_zero = NULL_TREE;
 744
 745   if (may_be_zero)
 746     {
 747       if (COMPARISON_CLASS_P (may_be_zero))
 748         {
 749           /* Try to combine may_be_zero with assumptions, this can simplify
 750              computation of niter expression; else make niter 0 when it holds.  */
 751           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 752             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 753                                              niter_assumptions,
 754                                              fold_build1 (TRUTH_NOT_EXPR,
 755                                                           boolean_type_node,
 756                                                           may_be_zero));
 757           else
 758             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 759                                  build_int_cst (TREE_TYPE (niter), 0),
 760                                  rewrite_to_non_trapping_overflow (niter));
 761           /* Either way may_be_zero has now been accounted for.  */
 762           may_be_zero = NULL_TREE;
 763         }
 764       else if (integer_nonzerop (may_be_zero)) /* Constant nonzero: 0 latch iterations, 1 in total.  */
 765         {
 766           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 767           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 768           return cond;
 769         }
 770       else
 771         return cond; /* Any other form of may_be_zero: outputs keep their defaults.  */
 772     }
 773
 774   *assumptions = niter_assumptions;
 775   *number_of_iterationsm1 = niter;
 776
 777   /* We want the number of loop header executions which is the number
 778      of latch executions plus one.
 779      ???  For UINT_MAX latch executions this number overflows to zero
 780      for loops like do { n++; } while (n != 0);  */
 781   if (niter && !chrec_contains_undetermined (niter))
 782     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 783                           build_int_cst (TREE_TYPE (niter), 1));
 784   *number_of_iterations = niter;
 785
 786   return cond;
 787 }
 788
 789 /* Function bb_in_loop_p
 790    Predicate for a dfs order traversal of the loop bbs: true iff BB
 791    is inside the loop that DATA points to.  */
 792
 793 static bool
 794 bb_in_loop_p (const_basic_block bb, const void *data)
 795 {
 796   /* DATA is the loop whose membership is being tested.  */
 797   const struct loop *const target_loop = (const struct loop *) data;
 798
 799   return flow_bb_inside_loop_p (target_loop, bb);
 800 }
 801
 802
 803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 804    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 805
 806 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
 807   : vec_info (vec_info::loop, init_cost (loop_in)),
 808     loop (loop_in),
 809     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 810     num_itersm1 (NULL_TREE),
 811     num_iters (NULL_TREE),
 812     num_iters_unchanged (NULL_TREE),
 813     num_iters_assumptions (NULL_TREE),
 814     th (0),
 815     versioning_threshold (0),
 816     vectorization_factor (0),
 817     max_vectorization_factor (0),
 818     mask_skip_niters (NULL_TREE),
 819     mask_compare_type (NULL_TREE),
 820     unaligned_dr (NULL),
 821     peeling_for_alignment (0),
 822     ptr_mask (0),
 823     ivexpr_map (NULL),
 824     slp_unrolling_factor (1),
 825     single_scalar_iteration_cost (0),
 826     vectorizable (false),
 827     can_fully_mask_p (true),
 828     fully_masked_p (false),
 829     peeling_for_gaps (false),
 830     peeling_for_niter (false),
 831     operands_swapped (false),
 832     no_data_dependencies (false),
 833     has_mask_store (false),
 834     scalar_loop (NULL),
 835     orig_loop_info (NULL)
 836 {
 837   /* Create/Update stmt_info for all stmts in the loop.  */
 838   basic_block *body = get_loop_body (loop);
 839   for (unsigned int i = 0; i < loop->num_nodes; i++)
 840     {
 841       basic_block bb = body[i];
 842       gimple_stmt_iterator si;
 843
 844       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 845         {
 846           gimple *phi = gsi_stmt (si);
 847           gimple_set_uid (phi, 0);
 848           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
 849         }
 850
 851       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 852         {
 853           gimple *stmt = gsi_stmt (si);
 854           gimple_set_uid (stmt, 0);
 855           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
 856         }
 857     }
 858   free (body);
 859
 860   /* CHECKME: We want to visit all BBs before their successors (except for
 861      latch blocks, for which this assertion wouldn't hold).  In the simple
 862      case of the loop forms we allow, a dfs order of the BBs would the same
 863      as reversed postorder traversal, so we are safe.  */
 864
 865   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 866                                           bbs, loop->num_nodes, loop);
 867   gcc_assert (nbbs == loop->num_nodes);
 868 }
 869
 870 /* Free all levels of MASKS.  */
 871
 872 void
 873 release_vec_loop_masks (vec_loop_masks *masks)
 874 {
 875   rgroup_masks *rgm;
 876   unsigned int i;
 877   FOR_EACH_VEC_ELT (*masks, i, rgm)
 878     rgm->masks.release ();
 879   masks->release ();
 880 }
 881
/* Free all memory used by the _loop_vec_info, as well as all the
   stmt_vec_info structs of all the stmts in the loop.

   While walking the statements this also undoes operand swaps recorded
   by OPERANDS_SWAPPED, so that assignments are left in canonical form.
   Finally the BBS array, the loop masks and the IV expression map are
   released and the loop's AUX pointer is cleared.  */

_loop_vec_info::~_loop_vec_info ()
{
  int nbbs;
  gimple_stmt_iterator si;
  int j;

  /* ???  We're releasing loop_vinfos en-block.  */
  set_stmt_vec_info_vec (&stmt_vec_infos);
  nbbs = loop->num_nodes;
  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      /* Phis only need their stmt_vec_info released.  */
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      /* The iterator is advanced at the bottom of the body, after the
         statement's info has been freed.  */
      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple *stmt = gsi_stmt (si);

          /* We may have broken canonical form by moving a constant
             into RHS1 of a commutative op.  Fix such occurrences.  */
          if (operands_swapped && is_gimple_assign (stmt))
            {
              enum tree_code code = gimple_assign_rhs_code (stmt);

              if ((code == PLUS_EXPR
                   || code == POINTER_PLUS_EXPR
                   || code == MULT_EXPR)
                  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
                swap_ssa_operands (stmt,
                                   gimple_assign_rhs1_ptr (stmt),
                                   gimple_assign_rhs2_ptr (stmt));
              /* For a COND_EXPR with a constant as its second operand,
                 invert the comparison and exchange the second and third
                 operands.  */
              else if (code == COND_EXPR
                       && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
                {
                  tree cond_expr = gimple_assign_rhs1 (stmt);
                  enum tree_code cond_code = TREE_CODE (cond_expr);

                  if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                    {
                      bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
                                                                  0));
                      cond_code = invert_tree_comparison (cond_code,
                                                          honor_nans);
                      /* ERROR_MARK means no inverted comparison code is
                         available; the statement is then left as is.  */
                      if (cond_code != ERROR_MARK)
                        {
                          TREE_SET_CODE (cond_expr, cond_code);
                          swap_ssa_operands (stmt,
                                             gimple_assign_rhs2_ptr (stmt),
                                             gimple_assign_rhs3_ptr (stmt));
                        }
                    }
                }
            }

          /* Free stmt_vec_info.  */
          free_stmt_vec_info (stmt);
          gsi_next (&si);
        }
    }

  free (bbs);

  release_vec_loop_masks (&masks);
  delete ivexpr_map;

  /* Drop the back-pointer from the loop to this (now dead) object.  */
  loop->aux = NULL;
}
 953
 954 /* Return an invariant or register for EXPR and emit necessary
 955    computations in the LOOP_VINFO loop preheader.  */
 956
 957 tree
 958 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 959 {
 960   if (is_gimple_reg (expr)
 961       || is_gimple_min_invariant (expr))
 962     return expr;
 963
 964   if (! loop_vinfo->ivexpr_map)
 965     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 966   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 967   if (! cached)
 968     {
 969       gimple_seq stmts = NULL;
 970       cached = force_gimple_operand (unshare_expr (expr),
 971                                      &stmts, true, NULL_TREE);
 972       if (stmts)
 973         {
 974           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 975           gsi_insert_seq_on_edge_immediate (e, stmts);
 976         }
 977     }
 978   return cached;
 979 }
 980
 981 /* Return true if we can use CMP_TYPE as the comparison type to produce
 982    all masks required to mask LOOP_VINFO.  */
 983
 984 static bool
 985 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 986 {
 987   rgroup_masks *rgm;
 988   unsigned int i;
 989   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 990     if (rgm->mask_type != NULL_TREE
 991         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 992                                             cmp_type, rgm->mask_type,
 993                                             OPTIMIZE_FOR_SPEED))
 994       return false;
 995   return true;
 996 }
 997
 998 /* Calculate the maximum number of scalars per iteration for every
 999    rgroup in LOOP_VINFO.  */
1000
1001 static unsigned int
1002 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1003 {
1004   unsigned int res = 1;
1005   unsigned int i;
1006   rgroup_masks *rgm;
1007   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1008     res = MAX (res, rgm->max_nscalars_per_iter);
1009   return res;
1010 }
1011
1012 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1013    whether we can actually generate the masks required.  Return true if so,
1014    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1015
1016 static bool
1017 vect_verify_full_masking (loop_vec_info loop_vinfo)
1018 {
1019   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1020   unsigned int min_ni_width;
1021
1022   /* Use a normal loop if there are no statements that need masking.
1023      This only happens in rare degenerate cases: it means that the loop
1024      has no loads, no stores, and no live-out values.  */
1025   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1026     return false;
1027
1028   /* Get the maximum number of iterations that is representable
1029      in the counter type.  */
1030   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1031   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1032
1033   /* Get a more refined estimate for the number of iterations.  */
1034   widest_int max_back_edges;
1035   if (max_loop_iterations (loop, &max_back_edges))
1036     max_ni = wi::smin (max_ni, max_back_edges + 1);
1037
1038   /* Account for rgroup masks, in which each bit is replicated N times.  */
1039   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1040
1041   /* Work out how many bits we need to represent the limit.  */
1042   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1043
1044   /* Find a scalar mode for which WHILE_ULT is supported.  */
1045   opt_scalar_int_mode cmp_mode_iter;
1046   tree cmp_type = NULL_TREE;
1047   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1048     {
1049       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1050       if (cmp_bits >= min_ni_width
1051           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1052         {
1053           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1054           if (this_type
1055               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1056             {
1057               /* Although we could stop as soon as we find a valid mode,
1058                  it's often better to continue until we hit Pmode, since the
1059                  operands to the WHILE are more likely to be reusable in
1060                  address calculations.  */
1061               cmp_type = this_type;
1062               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1063                 break;
1064             }
1065         }
1066     }
1067
1068   if (!cmp_type)
1069     return false;
1070
1071   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1072   return true;
1073 }
1074
1075 /* Calculate the cost of one scalar iteration of the loop.  */
1076 static void
1077 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1078 {
1079   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1080   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1081   int nbbs = loop->num_nodes, factor;
1082   int innerloop_iters, i;
1083
1084   /* Gather costs for statements in the scalar loop.  */
1085
1086   /* FORNOW.  */
1087   innerloop_iters = 1;
1088   if (loop->inner)
1089     innerloop_iters = 50; /* FIXME */
1090
1091   for (i = 0; i < nbbs; i++)
1092     {
1093       gimple_stmt_iterator si;
1094       basic_block bb = bbs[i];
1095
1096       if (bb->loop_father == loop->inner)
1097         factor = innerloop_iters;
1098       else
1099         factor = 1;
1100
1101       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1102         {
1103           gimple *stmt = gsi_stmt (si);
1104           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1105
1106           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1107             continue;
1108
1109           /* Skip stmts that are not vectorized inside the loop.  */
1110           if (stmt_info
1111               && !STMT_VINFO_RELEVANT_P (stmt_info)
1112               && (!STMT_VINFO_LIVE_P (stmt_info)
1113                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1114               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1115             continue;
1116
1117           vect_cost_for_stmt kind;
1118           if (STMT_VINFO_DATA_REF (stmt_info))
1119             {
1120               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1121                kind = scalar_load;
1122              else
1123                kind = scalar_store;
1124             }
1125           else
1126             kind = scalar_stmt;
1127
1128           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1129                             factor, kind, stmt_info, 0, vect_prologue);
1130         }
1131     }
1132
1133   /* Now accumulate cost.  */
1134   void *target_cost_data = init_cost (loop);
1135   stmt_info_for_cost *si;
1136   int j;
1137   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1138                     j, si)
1139     {
1140       struct _stmt_vec_info *stmt_info
1141         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1142       (void) add_stmt_cost (target_cost_data, si->count,
1143                             si->kind, stmt_info, si->misalign,
1144                             vect_body);
1145     }
1146   unsigned dummy, body_cost = 0;
1147   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1148   destroy_cost_data (target_cost_data);
1149   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1150 }
1151
1152
/* Function vect_analyze_loop_form_1.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e, a countable loop.  The
     niter could be analyzed under some assumptions.

   On success return true, with *LOOP_COND set to the exit condition
   returned by vect_get_loop_niters and *ASSUMPTIONS,
   *NUMBER_OF_ITERATIONSM1 and *NUMBER_OF_ITERATIONS filled in by that
   same call.  When LOOP contains an inner loop, the inner loop is
   analyzed recursively and INNER_LOOP_COND receives its exit condition;
   INNER_LOOP_COND is not used for an inner-most loop.  On failure return
   false, in most cases after dumping the reason.  */

bool
vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
                          tree *assumptions, tree *number_of_iterationsm1,
                          tree *number_of_iterations, gcond **inner_loop_cond)
{
  DUMP_VECT_SCOPE ("vect_analyze_loop_form");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: control flow in loop.\n");
          return false;
        }

      if (empty_block_p (loop->header))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: empty loop.\n");
          return false;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop. We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      /* Reject an inner loop that itself has a nested loop or that has a
         sibling loop.  */
      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: multiple nested loops.\n");
          return false;
        }

      if (loop->num_nodes != 5)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: control flow in loop.\n");
          return false;
        }

      /* The inner loop must be entered straight from the outer header and
         its single exit must lead to the block that precedes the outer
         latch (the "tail" in the picture above).  */
      entryedge = loop_preheader_edge (innerloop);
      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported outerloop form.\n");
          return false;
        }

      /* Analyze the inner-loop.  */
      tree inner_niterm1, inner_niter, inner_assumptions;
      if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
                                      &inner_assumptions, &inner_niterm1,
                                      &inner_niter, NULL)
          /* Don't support analyzing niter under assumptions for inner
             loop.  */
          || !integer_onep (inner_assumptions))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: Bad inner loop.\n");
          return false;
        }

      /* The inner iteration count must not vary across iterations of the
         outer loop.  */
      if (!expr_invariant_in_loop_p (loop, inner_niter))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: inner-loop count not"
                             " invariant.\n");
          return false;
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Considering outer-loop vectorization.\n");
    }

  /* The remaining checks apply to inner-most and outer loops alike.  The
     header must have exactly two incoming edges and the loop exactly one
     exit.  */
  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (dump_enabled_p ())
        {
          if (!single_exit (loop))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: multiple exits.\n");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: too many incoming edges.\n");
        }
      return false;
    }

  /* We assume that the loop exit condition is at the end of the loop. i.e,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: latch block not empty.\n");
      return false;
    }

  /* Make sure the exit is not abnormal.  */
  edge e = single_exit (loop);
  if (e->flags & EDGE_ABNORMAL)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: abnormal loop exit edge.\n");
      return false;
    }

  /* Fetch the exit condition together with the iteration counts and the
     assumptions under which they hold.  A null result means the exit
     condition could not be handled.  */
  *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
                                     number_of_iterationsm1);
  if (!*loop_cond)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: complicated exit condition.\n");
      return false;
    }

  /* Give up if the assumptions are known to be false or if no usable
     iteration count was produced.  */
  if (integer_zerop (*assumptions)
      || !*number_of_iterations
      || chrec_contains_undetermined (*number_of_iterations))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: number of iterations cannot be "
                         "computed.\n");
      return false;
    }

  if (integer_zerop (*number_of_iterations))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: number of iterations = 0.\n");
      return false;
    }

  return true;
}
1350
1351 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1352
1353 loop_vec_info
1354 vect_analyze_loop_form (struct loop *loop)
1355 {
1356   tree assumptions, number_of_iterations, number_of_iterationsm1;
1357   gcond *loop_cond, *inner_loop_cond = NULL;
1358
1359   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1360                                   &assumptions, &number_of_iterationsm1,
1361                                   &number_of_iterations, &inner_loop_cond))
1362     return NULL;
1363
1364   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1365   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1366   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1367   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1368   if (!integer_onep (assumptions))
1369     {
1370       /* We consider to vectorize this loop by versioning it under
1371          some assumptions.  In order to do this, we need to clear
1372          existing information computed by scev and niter analyzer.  */
1373       scev_reset_htab ();
1374       free_numbers_of_iterations_estimates (loop);
1375       /* Also set flag for this loop so that following scev and niter
1376          analysis are done under the assumptions.  */
1377       loop_constraint_set (loop, LOOP_C_FINITE);
1378       /* Also record the assumptions for versioning.  */
1379       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1380     }
1381
1382   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1383     {
1384       if (dump_enabled_p ())
1385         {
1386           dump_printf_loc (MSG_NOTE, vect_location,
1387                            "Symbolic number of iterations is ");
1388           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1389           dump_printf (MSG_NOTE, "\n");
1390         }
1391     }
1392
1393   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1394   if (inner_loop_cond)
1395     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1396       = loop_exit_ctrl_vec_info_type;
1397
1398   gcc_assert (!loop->aux);
1399   loop->aux = loop_vinfo;
1400   return loop_vinfo;
1401 }
1402
1403
1404
1405 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1406    statements update the vectorization factor.  */
1407
1408 static void
1409 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1410 {
1411   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1412   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1413   int nbbs = loop->num_nodes;
1414   poly_uint64 vectorization_factor;
1415   int i;
1416
1417   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1418
1419   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1420   gcc_assert (known_ne (vectorization_factor, 0U));
1421
1422   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1423      vectorization factor of the loop is the unrolling factor required by
1424      the SLP instances.  If that unrolling factor is 1, we say, that we
1425      perform pure SLP on loop - cross iteration parallelism is not
1426      exploited.  */
1427   bool only_slp_in_loop = true;
1428   for (i = 0; i < nbbs; i++)
1429     {
1430       basic_block bb = bbs[i];
1431       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1432            gsi_next (&si))
1433         {
1434           gimple *stmt = gsi_stmt (si);
1435           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1436           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1437               && STMT_VINFO_RELATED_STMT (stmt_info))
1438             {
1439               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1440               stmt_info = vinfo_for_stmt (stmt);
1441             }
1442           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1443                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1444               && !PURE_SLP_STMT (stmt_info))
1445             /* STMT needs both SLP and loop-based vectorization.  */
1446             only_slp_in_loop = false;
1447         }
1448     }
1449
1450   if (only_slp_in_loop)
1451     {
1452       dump_printf_loc (MSG_NOTE, vect_location,
1453                        "Loop contains only SLP stmts\n");
1454       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1455     }
1456   else
1457     {
1458       dump_printf_loc (MSG_NOTE, vect_location,
1459                        "Loop contains SLP and non-SLP stmts\n");
1460       /* Both the vectorization factor and unroll factor have the form
1461          current_vector_size * X for some rational X, so they must have
1462          a common multiple.  */
1463       vectorization_factor
1464         = force_common_multiple (vectorization_factor,
1465                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1466     }
1467
1468   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1469   if (dump_enabled_p ())
1470     {
1471       dump_printf_loc (MSG_NOTE, vect_location,
1472                        "Updating vectorization factor to ");
1473       dump_dec (MSG_NOTE, vectorization_factor);
1474       dump_printf (MSG_NOTE, ".\n");
1475     }
1476 }
1477
1478 /* Return true if STMT_INFO describes a double reduction phi and if
1479    the other phi in the reduction is also relevant for vectorization.
1480    This rejects cases such as:
1481
1482       outer1:
1483         x_1 = PHI <x_3(outer2), ...>;
1484         ...
1485
1486       inner:
1487         x_2 = ...;
1488         ...
1489
1490       outer2:
1491         x_3 = PHI <x_2(inner)>;
1492
1493    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1494
1495 static bool
1496 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1497 {
1498   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1499     return false;
1500
1501   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1502   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1503 }
1504
1505 /* Function vect_analyze_loop_operations.
1506
1507    Scan the loop stmts and make sure they are all vectorizable.  */
1508
1509 static bool
1510 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1511 {
1512   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1513   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1514   int nbbs = loop->num_nodes;
1515   int i;
1516   stmt_vec_info stmt_info;
1517   bool need_to_vectorize = false;
1518   bool ok;
1519
1520   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1521
1522   stmt_vector_for_cost cost_vec;
1523   cost_vec.create (2);
1524
1525   for (i = 0; i < nbbs; i++)
1526     {
1527       basic_block bb = bbs[i];
1528
1529       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1530            gsi_next (&si))
1531         {
1532           gphi *phi = si.phi ();
1533           ok = true;
1534
1535           stmt_info = vinfo_for_stmt (phi);
1536           if (dump_enabled_p ())
1537             {
1538               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1539               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1540             }
1541           if (virtual_operand_p (gimple_phi_result (phi)))
1542             continue;
1543
1544           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1545              (i.e., a phi in the tail of the outer-loop).  */
1546           if (! is_loop_header_bb_p (bb))
1547             {
1548               /* FORNOW: we currently don't support the case that these phis
1549                  are not used in the outerloop (unless it is double reduction,
1550                  i.e., this phi is vect_reduction_def), cause this case
1551                  requires to actually do something here.  */
1552               if (STMT_VINFO_LIVE_P (stmt_info)
1553                   && !vect_active_double_reduction_p (stmt_info))
1554                 {
1555                   if (dump_enabled_p ())
1556                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1557                                      "Unsupported loop-closed phi in "
1558                                      "outer-loop.\n");
1559                   return false;
1560                 }
1561
1562               /* If PHI is used in the outer loop, we check that its operand
1563                  is defined in the inner loop.  */
1564               if (STMT_VINFO_RELEVANT_P (stmt_info))
1565                 {
1566                   tree phi_op;
1567                   gimple *op_def_stmt;
1568
1569                   if (gimple_phi_num_args (phi) != 1)
1570                     return false;
1571
1572                   phi_op = PHI_ARG_DEF (phi, 0);
1573                   if (TREE_CODE (phi_op) != SSA_NAME)
1574                     return false;
1575
1576                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1577                   if (gimple_nop_p (op_def_stmt)
1578                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1579                       || !vinfo_for_stmt (op_def_stmt))
1580                     return false;
1581
1582                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1583                         != vect_used_in_outer
1584                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1585                            != vect_used_in_outer_by_reduction)
1586                     return false;
1587                 }
1588
1589               continue;
1590             }
1591
1592           gcc_assert (stmt_info);
1593
1594           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1595                || STMT_VINFO_LIVE_P (stmt_info))
1596               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1597             {
1598               /* A scalar-dependence cycle that we don't support.  */
1599               if (dump_enabled_p ())
1600                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601                                  "not vectorized: scalar dependence cycle.\n");
1602               return false;
1603             }
1604
1605           if (STMT_VINFO_RELEVANT_P (stmt_info))
1606             {
1607               need_to_vectorize = true;
1608               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1609                   && ! PURE_SLP_STMT (stmt_info))
1610                 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1611               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1612                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1613                        && ! PURE_SLP_STMT (stmt_info))
1614                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1615                                              &cost_vec);
1616             }
1617
1618           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1619           if (ok
1620               && STMT_VINFO_LIVE_P (stmt_info)
1621               && !PURE_SLP_STMT (stmt_info))
1622             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1623                                               &cost_vec);
1624
1625           if (!ok)
1626             {
1627               if (dump_enabled_p ())
1628                 {
1629                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630                                    "not vectorized: relevant phi not "
1631                                    "supported: ");
1632                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1633                 }
1634               return false;
1635             }
1636         }
1637
1638       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1639            gsi_next (&si))
1640         {
1641           gimple *stmt = gsi_stmt (si);
1642           if (!gimple_clobber_p (stmt)
1643               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1644                                      &cost_vec))
1645             return false;
1646         }
1647     } /* bbs */
1648
1649   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1650   cost_vec.release ();
1651
1652   /* All operations in the loop are either irrelevant (deal with loop
1653      control, or dead), or only used outside the loop and can be moved
1654      out of the loop (e.g. invariants, inductions).  The loop can be
1655      optimized away by scalar optimizations.  We're better off not
1656      touching this loop.  */
1657   if (!need_to_vectorize)
1658     {
1659       if (dump_enabled_p ())
1660         dump_printf_loc (MSG_NOTE, vect_location,
1661                          "All the computation can be taken out of the loop.\n");
1662       if (dump_enabled_p ())
1663         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1664                          "not vectorized: redundant loop. no profit to "
1665                          "vectorize.\n");
1666       return false;
1667     }
1668
1669   return true;
1670 }
1671
1672 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1673    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1674    definitely no, or -1 if it's worth retrying.  */
1675
1676 static int
1677 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1678 {
1679   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1680   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1681
1682   /* Only fully-masked loops can have iteration counts less than the
1683      vectorization factor.  */
1684   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1685     {
1686       HOST_WIDE_INT max_niter;
1687
1688       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1689         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1690       else
1691         max_niter = max_stmt_executions_int (loop);
1692
1693       if (max_niter != -1
1694           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1695         {
1696           if (dump_enabled_p ())
1697             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698                              "not vectorized: iteration count smaller than "
1699                              "vectorization factor.\n");
1700           return 0;
1701         }
1702     }
1703
1704   int min_profitable_iters, min_profitable_estimate;
1705   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1706                                       &min_profitable_estimate);
1707
1708   if (min_profitable_iters < 0)
1709     {
1710       if (dump_enabled_p ())
1711         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1712                          "not vectorized: vectorization not profitable.\n");
1713       if (dump_enabled_p ())
1714         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1715                          "not vectorized: vector version will never be "
1716                          "profitable.\n");
1717       return -1;
1718     }
1719
1720   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1721                                * assumed_vf);
1722
1723   /* Use the cost model only if it is more conservative than user specified
1724      threshold.  */
1725   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1726                                     min_profitable_iters);
1727
1728   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1729
1730   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1731       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1732     {
1733       if (dump_enabled_p ())
1734         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1735                          "not vectorized: vectorization not profitable.\n");
1736       if (dump_enabled_p ())
1737         dump_printf_loc (MSG_NOTE, vect_location,
1738                          "not vectorized: iteration count smaller than user "
1739                          "specified loop bound parameter or minimum profitable "
1740                          "iterations (whichever is more conservative).\n");
1741       return 0;
1742     }
1743
1744   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1745   if (estimated_niter == -1)
1746     estimated_niter = likely_max_stmt_executions_int (loop);
1747   if (estimated_niter != -1
1748       && ((unsigned HOST_WIDE_INT) estimated_niter
1749           < MAX (th, (unsigned) min_profitable_estimate)))
1750     {
1751       if (dump_enabled_p ())
1752         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753                          "not vectorized: estimated iteration count too "
1754                          "small.\n");
1755       if (dump_enabled_p ())
1756         dump_printf_loc (MSG_NOTE, vect_location,
1757                          "not vectorized: estimated iteration count smaller "
1758                          "than specified loop bound parameter or minimum "
1759                          "profitable iterations (whichever is more "
1760                          "conservative).\n");
1761       return -1;
1762     }
1763
1764   return 1;
1765 }
1766
1767 static bool
1768 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1769                            vec<data_reference_p> *datarefs,
1770                            unsigned int *n_stmts)
1771 {
1772   *n_stmts = 0;
1773   for (unsigned i = 0; i < loop->num_nodes; i++)
1774     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1775          !gsi_end_p (gsi); gsi_next (&gsi))
1776       {
1777         gimple *stmt = gsi_stmt (gsi);
1778         if (is_gimple_debug (stmt))
1779           continue;
1780         ++(*n_stmts);
1781         if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1782           {
1783             if (is_gimple_call (stmt) && loop->safelen)
1784               {
1785                 tree fndecl = gimple_call_fndecl (stmt), op;
1786                 if (fndecl != NULL_TREE)
1787                   {
1788                     cgraph_node *node = cgraph_node::get (fndecl);
1789                     if (node != NULL && node->simd_clones != NULL)
1790                       {
1791                         unsigned int j, n = gimple_call_num_args (stmt);
1792                         for (j = 0; j < n; j++)
1793                           {
1794                             op = gimple_call_arg (stmt, j);
1795                             if (DECL_P (op)
1796                                 || (REFERENCE_CLASS_P (op)
1797                                     && get_base_address (op)))
1798                               break;
1799                           }
1800                         op = gimple_call_lhs (stmt);
1801                         /* Ignore #pragma omp declare simd functions
1802                            if they don't have data references in the
1803                            call stmt itself.  */
1804                         if (j == n
1805                             && !(op
1806                                  && (DECL_P (op)
1807                                      || (REFERENCE_CLASS_P (op)
1808                                          && get_base_address (op)))))
1809                           continue;
1810                       }
1811                   }
1812               }
1813             return false;
1814           }
1815       }
1816   return true;
1817 }
1818
/* Function vect_analyze_loop_2.

   Run the vectorization analyses on the loop described by LOOP_VINFO,
   recording the results in LOOP_VINFO.  Return true if the loop can be
   vectorized and false otherwise.

   FATAL is set to true while the checks that are independent of the
   vector size are running and to false once they have all passed, so on
   a false return it tells the caller whether the failure came from that
   first group of checks.

   If SLP was used and a later check fails in a way that might succeed
   without it, the function rolls back its state and restarts the
   analysis from the start_over label with SLP disabled (see the again
   label below).  */
static bool
vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
{
  bool ok;
  int res;
  unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
  poly_uint64 min_vf = 2;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  */

  loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
  if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: loop nest containing two "
                         "or more consecutive inner loops cannot be "
                         "vectorized\n");
      return false;
    }

  /* Gather the data references and count stmts in the loop.  */
  unsigned int n_stmts;
  if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
                                  &LOOP_VINFO_DATAREFS (loop_vinfo),
                                  &n_stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: loop contains function "
                         "calls or data references that cannot "
                         "be analyzed\n");
      return false;
    }

  /* Analyze the data references and also adjust the minimal
     vectorization factor according to the loads and stores.  */

  ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data references.\n");
      return false;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */
  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  vect_fixup_scalar_cycles_with_patterns (loop_vinfo);

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.). FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data access.\n");
      return false;
    }

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unexpected pattern.\n");
      return false;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
  if (!ok
      || (max_vf != MAX_VECTORIZATION_FACTOR
          && maybe_lt (max_vf, min_vf)))
    {
      if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "bad data dependence.\n");
      return false;
    }
  LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine vectorization factor.\n");
      return false;
    }
  /* The chosen vectorization factor must not exceed the limit imposed
     by the data dependences.  */
  if (max_vf != MAX_VECTORIZATION_FACTOR
      && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data dependence.\n");
      return false;
    }

  /* Compute the scalar iteration cost.  */
  vect_compute_single_scalar_iteration_cost (loop_vinfo);

  /* Remember the vectorization factor as computed before any SLP decision
     so that it can be restored if we retry with SLP disabled.  */
  poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned th;

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, n_stmts);
  if (!ok)
    return false;

  /* If there are any SLP instances mark them as pure_slp.  */
  bool slp = vect_make_slp_decision (loop_vinfo);
  if (slp)
    {
      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);

      /* Update the vectorization factor based on the SLP decision.  */
      vect_update_vf_for_slp (loop_vinfo);
    }

  /* Likewise remember whether full masking was still possible at this
     point; the retry path restores this value too.  */
  bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);

  /* We don't expect to have to roll back to anything other than an empty
     set of rgroups.  */
  gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());

  /* This is the point where we can re-start analysis with SLP forced off.  */
start_over:

  /* Now the vectorization factor is final.  */
  poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "vectorization_factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
                   LOOP_VINFO_INT_NITERS (loop_vinfo));
    }

  /* Used below when deciding whether a versioned loop needs an
     epilogue.  */
  HOST_WIDE_INT max_niter
    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data alignment.\n");
      return false;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    return false;

  /* Do not invoke vect_enhance_data_refs_alignment for epilogue
     vectorization.  */
  if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    {
    /* This pass will decide on using loop versioning and/or loop peeling in
       order to enhance the alignment of data references in the loop.  */
    ok = vect_enhance_data_refs_alignment (loop_vinfo);
    if (!ok)
      {
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "bad data alignment.\n");
        return false;
      }
    }

  if (slp)
    {
      /* Analyze operations in the SLP instances.  Note this may
         remove unsupported SLP instances which makes the above
         SLP kind detection invalid.  */
      unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
      vect_slp_analyze_operations (loop_vinfo);
      if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
        goto again;
    }

  /* Scan all the remaining operations in the loop that are not subject
     to SLP and make sure they are vectorizable.  */
  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad operation or unsupported loop bound.\n");
      return false;
    }

  /* Decide whether to use a fully-masked loop for this vectorization
     factor.  */
  LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
    = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
       && vect_verify_full_masking (loop_vinfo));
  if (dump_enabled_p ())
    {
      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
        dump_printf_loc (MSG_NOTE, vect_location,
                         "using a fully-masked loop.\n");
      else
        dump_printf_loc (MSG_NOTE, vect_location,
                         "not using a fully-masked loop.\n");
    }

  /* If epilog loop is required because of data accesses with gaps,
     one additional iteration needs to be peeled.  Check if there are
     enough iterations for vectorization.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);

      if (known_lt (wi::to_widest (scalar_niters), vf))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "loop has no enough iterations to support"
                             " peeling for gaps.\n");
          return false;
        }
    }

  /* Check the costings of the loop make vectorizing worthwhile.  */
  res = vect_analyze_loop_costing (loop_vinfo);
  /* A negative result means the costing is worth retrying, which here
     means retrying with SLP disabled if SLP was used.  */
  if (res < 0)
    goto again;
  if (!res)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Loop costings not worthwhile.\n");
      return false;
    }

  /* Decide whether we need to create an epilogue loop to handle
     remaining scalar iterations.  */
  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);

  unsigned HOST_WIDE_INT const_vf;
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    /* The main loop handles all iterations.  */
    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      /* Both the iteration count and the number of peeled prolog
         iterations are known: an epilogue is needed only if what
         remains is not a multiple of the vectorization factor.  */
      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
                       - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
                       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
        LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
    }
  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
           || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
           || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
                < (unsigned) exact_log2 (const_vf))
               /* In case of versioning, check if the maximum number of
                  iterations is greater than th.  If they are identical,
                  the epilogue is unnecessary.  */
               && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
                   || ((unsigned HOST_WIDE_INT) max_niter
                       > (th / const_vf) * const_vf))))
    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;

  /* If an epilogue loop is required make sure we can create one.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
      if (!vect_can_advance_ivs_p (loop_vinfo)
          || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
                                           single_exit (LOOP_VINFO_LOOP
                                                         (loop_vinfo))))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: can't create required "
                             "epilog loop\n");
          goto again;
        }
    }

  /* During peeling, we need to check if number of loop iterations is
     enough for both peeled prolog loop and vector loop.  This check
     can be merged along with threshold check of loop versioning, so
     increase threshold for this case if necessary.  */
  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      poly_uint64 niters_th = 0;

      if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
        {
          /* Niters for peeled prolog loop.  */
          if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            {
              /* A negative peeling amount is handled by assuming the
                 largest possible prolog for the unaligned reference's
                 vector type: one less than its number of elements.  */
              struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
              tree vectype
                = STMT_VINFO_VECTYPE (vinfo_for_stmt (vect_dr_stmt (dr)));
              niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
            }
          else
            niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
        }

      /* Niters for at least one iteration of vectorized loop.  */
      if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
        niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      /* One additional iteration because of peeling for gap.  */
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
        niters_th += 1;
      LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
    }

  /* Nothing after start_over may have changed the vectorization
     factor.  */
  gcc_assert (known_eq (vectorization_factor,
                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)));

  /* Ok to vectorize!  */
  return true;

again:
  /* Try again with SLP forced off but if we didn't do any SLP there is
     no point in re-trying.  */
  if (!slp)
    return false;

  /* If there are reduction chains re-trying will fail anyway.  */
  if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
    return false;

  /* Likewise if the grouped loads or stores in the SLP cannot be handled
     via interleaving or lane instructions.  */
  slp_instance instance;
  slp_tree node;
  unsigned i, j;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    {
      stmt_vec_info vinfo;
      vinfo = vinfo_for_stmt
          (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
      /* Only instances rooted at a grouped access need checking.  */
      if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
        continue;
      vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
      unsigned int size = DR_GROUP_SIZE (vinfo);
      tree vectype = STMT_VINFO_VECTYPE (vinfo);
      if (! vect_store_lanes_supported (vectype, size, false)
         && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
         && ! vect_grouped_store_supported (vectype, size))
       return false;
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
        {
          vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
          vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
          bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
          size = DR_GROUP_SIZE (vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo);
          if (! vect_load_lanes_supported (vectype, size, false)
              && ! vect_grouped_load_supported (vectype, single_element_p,
                                                size))
            return false;
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "re-trying with SLP disabled\n");

  /* Roll back state appropriately.  No SLP this time.  */
  slp = false;
  /* Restore vectorization factor as it were without SLP.  */
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
  /* Free the SLP instances.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Reset SLP type to loop_vect on all stmts.  */
  for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gimple_stmt_iterator si = gsi_start_phis (bb);
           !gsi_end_p (si); gsi_next (&si))
        {
          stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
          STMT_SLP_TYPE (stmt_info) = loop_vect;
        }
      for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si); gsi_next (&si))
        {
          stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
          STMT_SLP_TYPE (stmt_info) = loop_vect;
          /* Also reset the related pattern stmt and the stmts in its
             pattern definition sequence.  */
          if (STMT_VINFO_IN_PATTERN_P (stmt_info))
            {
              stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
              STMT_SLP_TYPE (stmt_info) = loop_vect;
              for (gimple_stmt_iterator pi
                     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
                   !gsi_end_p (pi); gsi_next (&pi))
                {
                  gimple *pstmt = gsi_stmt (pi);
                  STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
                }
            }
        }
    }
  /* Free optimized alias test DDRS.  */
  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
  LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
  LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
  /* Reset target cost data.  */
  destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
  LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
    = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
  /* Reset accumulated rgroup information.  */
  release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
  /* Reset assorted flags.  */
  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
  LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
  LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;

  goto start_over;
}
2281
2282 /* Function vect_analyze_loop.
2283
2284    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2285    for it.  The different analyses will record information in the
2286    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2287    be vectorized.  */
2288 loop_vec_info
2289 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2290 {
2291   loop_vec_info loop_vinfo;
2292   auto_vector_sizes vector_sizes;
2293
2294   /* Autodetect first vector size we try.  */
2295   current_vector_size = 0;
2296   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2297   unsigned int next_size = 0;
2298
2299   DUMP_VECT_SCOPE ("analyze_loop_nest");
2300
2301   if (loop_outer (loop)
2302       && loop_vec_info_for_loop (loop_outer (loop))
2303       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2304     {
2305       if (dump_enabled_p ())
2306         dump_printf_loc (MSG_NOTE, vect_location,
2307                          "outer-loop already vectorized.\n");
2308       return NULL;
2309     }
2310
2311   poly_uint64 autodetected_vector_size = 0;
2312   while (1)
2313     {
2314       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2315       loop_vinfo = vect_analyze_loop_form (loop);
2316       if (!loop_vinfo)
2317         {
2318           if (dump_enabled_p ())
2319             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320                              "bad loop form.\n");
2321           return NULL;
2322         }
2323
2324       bool fatal = false;
2325
2326       if (orig_loop_vinfo)
2327         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2328
2329       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2330         {
2331           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2332
2333           return loop_vinfo;
2334         }
2335
2336       delete loop_vinfo;
2337
2338       if (next_size == 0)
2339         autodetected_vector_size = current_vector_size;
2340
2341       if (next_size < vector_sizes.length ()
2342           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2343         next_size += 1;
2344
2345       if (fatal
2346           || next_size == vector_sizes.length ()
2347           || known_eq (current_vector_size, 0U))
2348         return NULL;
2349
2350       /* Try the next biggest vector size.  */
2351       current_vector_size = vector_sizes[next_size++];
2352       if (dump_enabled_p ())
2353         {
2354           dump_printf_loc (MSG_NOTE, vect_location,
2355                            "***** Re-trying analysis with "
2356                            "vector size ");
2357           dump_dec (MSG_NOTE, current_vector_size);
2358           dump_printf (MSG_NOTE, "\n");
2359         }
2360     }
2361 }
2362
2363 /* Return true if there is an in-order reduction function for CODE, storing
2364    it in *REDUC_FN if so.  */
2365
2366 static bool
2367 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2368 {
2369   switch (code)
2370     {
2371     case PLUS_EXPR:
2372       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2373       return true;
2374
2375     default:
2376       return false;
2377     }
2378 }
2379
2380 /* Function reduction_fn_for_scalar_code
2381
2382    Input:
2383    CODE - tree_code of a reduction operations.
2384
2385    Output:
2386    REDUC_FN - the corresponding internal function to be used to reduce the
2387       vector of partial results into a single scalar result, or IFN_LAST
2388       if the operation is a supported reduction operation, but does not have
2389       such an internal function.
2390
2391    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2392
2393 static bool
2394 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2395 {
2396   switch (code)
2397     {
2398       case MAX_EXPR:
2399         *reduc_fn = IFN_REDUC_MAX;
2400         return true;
2401
2402       case MIN_EXPR:
2403         *reduc_fn = IFN_REDUC_MIN;
2404         return true;
2405
2406       case PLUS_EXPR:
2407         *reduc_fn = IFN_REDUC_PLUS;
2408         return true;
2409
2410       case BIT_AND_EXPR:
2411         *reduc_fn = IFN_REDUC_AND;
2412         return true;
2413
2414       case BIT_IOR_EXPR:
2415         *reduc_fn = IFN_REDUC_IOR;
2416         return true;
2417
2418       case BIT_XOR_EXPR:
2419         *reduc_fn = IFN_REDUC_XOR;
2420         return true;
2421
2422       case MULT_EXPR:
2423       case MINUS_EXPR:
2424         *reduc_fn = IFN_LAST;
2425         return true;
2426
2427       default:
2428        return false;
2429     }
2430 }
2431
2432 /* If there is a neutral value X such that SLP reduction NODE would not
2433    be affected by the introduction of additional X elements, return that X,
2434    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2435    is true if the SLP statements perform a single reduction, false if each
2436    statement performs an independent reduction.  */
2437
2438 static tree
2439 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2440                               bool reduc_chain)
2441 {
2442   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2443   gimple *stmt = stmts[0];
2444   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2445   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2446   tree scalar_type = TREE_TYPE (vector_type);
2447   struct loop *loop = gimple_bb (stmt)->loop_father;
2448   gcc_assert (loop);
2449
2450   switch (code)
2451     {
2452     case WIDEN_SUM_EXPR:
2453     case DOT_PROD_EXPR:
2454     case SAD_EXPR:
2455     case PLUS_EXPR:
2456     case MINUS_EXPR:
2457     case BIT_IOR_EXPR:
2458     case BIT_XOR_EXPR:
2459       return build_zero_cst (scalar_type);
2460
2461     case MULT_EXPR:
2462       return build_one_cst (scalar_type);
2463
2464     case BIT_AND_EXPR:
2465       return build_all_ones_cst (scalar_type);
2466
2467     case MAX_EXPR:
2468     case MIN_EXPR:
2469       /* For MIN/MAX the initial values are neutral.  A reduction chain
2470          has only a single initial value, so that value is neutral for
2471          all statements.  */
2472       if (reduc_chain)
2473         return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2474       return NULL_TREE;
2475
2476     default:
2477       return NULL_TREE;
2478     }
2479 }
2480
2481 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2482    STMT is printed with a message MSG. */
2483
2484 static void
2485 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2486 {
2487   dump_printf_loc (msg_type, vect_location, "%s", msg);
2488   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2489 }
2490
2491
2492 /* Detect SLP reduction of the form:
2493
2494    #a1 = phi <a5, a0>
2495    a2 = operation (a1)
2496    a3 = operation (a2)
2497    a4 = operation (a3)
2498    a5 = operation (a4)
2499
2500    #a = phi <a5>
2501
2502    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2503    FIRST_STMT is the first reduction stmt in the chain
2504    (a2 = operation (a1)).
2505
2506    Return TRUE if a reduction chain was detected.  */
2507
2508 static bool
2509 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2510                        gimple *first_stmt)
2511 {
2512   struct loop *loop = (gimple_bb (phi))->loop_father;
2513   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2514   enum tree_code code;
2515   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2516   stmt_vec_info use_stmt_info, current_stmt_info;
2517   tree lhs;
2518   imm_use_iterator imm_iter;
2519   use_operand_p use_p;
2520   int nloop_uses, size = 0, n_out_of_loop_uses;
2521   bool found = false;
2522
2523   if (loop != vect_loop)
2524     return false;
2525
2526   lhs = PHI_RESULT (phi);
2527   code = gimple_assign_rhs_code (first_stmt);
2528   while (1)
2529     {
2530       nloop_uses = 0;
2531       n_out_of_loop_uses = 0;
2532       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2533         {
2534           gimple *use_stmt = USE_STMT (use_p);
2535           if (is_gimple_debug (use_stmt))
2536             continue;
2537
2538           /* Check if we got back to the reduction phi.  */
2539           if (use_stmt == phi)
2540             {
2541               loop_use_stmt = use_stmt;
2542               found = true;
2543               break;
2544             }
2545
2546           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2547             {
2548               loop_use_stmt = use_stmt;
2549               nloop_uses++;
2550             }
2551            else
2552              n_out_of_loop_uses++;
2553
2554            /* There are can be either a single use in the loop or two uses in
2555               phi nodes.  */
2556            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2557              return false;
2558         }
2559
2560       if (found)
2561         break;
2562
2563       /* We reached a statement with no loop uses.  */
2564       if (nloop_uses == 0)
2565         return false;
2566
2567       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2568       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2569         return false;
2570
2571       if (!is_gimple_assign (loop_use_stmt)
2572           || code != gimple_assign_rhs_code (loop_use_stmt)
2573           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2574         return false;
2575
2576       /* Insert USE_STMT into reduction chain.  */
2577       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2578       if (current_stmt)
2579         {
2580           current_stmt_info = vinfo_for_stmt (current_stmt);
2581           REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2582           REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2583             = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2584         }
2585       else
2586         REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2587
2588       lhs = gimple_assign_lhs (loop_use_stmt);
2589       current_stmt = loop_use_stmt;
2590       size++;
2591    }
2592
2593   if (!found || loop_use_stmt != phi || size < 2)
2594     return false;
2595
2596   /* Swap the operands, if needed, to make the reduction operand be the second
2597      operand.  */
2598   lhs = PHI_RESULT (phi);
2599   next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2600   while (next_stmt)
2601     {
2602       if (gimple_assign_rhs2 (next_stmt) == lhs)
2603         {
2604           tree op = gimple_assign_rhs1 (next_stmt);
2605           gimple *def_stmt = NULL;
2606
2607           if (TREE_CODE (op) == SSA_NAME)
2608             def_stmt = SSA_NAME_DEF_STMT (op);
2609
2610           /* Check that the other def is either defined in the loop
2611              ("vect_internal_def"), or it's an induction (defined by a
2612              loop-header phi-node).  */
2613           if (def_stmt
2614               && gimple_bb (def_stmt)
2615               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2616               && (is_gimple_assign (def_stmt)
2617                   || is_gimple_call (def_stmt)
2618                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2619                            == vect_induction_def
2620                   || (gimple_code (def_stmt) == GIMPLE_PHI
2621                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2622                                   == vect_internal_def
2623                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2624             {
2625               lhs = gimple_assign_lhs (next_stmt);
2626               next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2627               continue;
2628             }
2629
2630           return false;
2631         }
2632       else
2633         {
2634           tree op = gimple_assign_rhs2 (next_stmt);
2635           gimple *def_stmt = NULL;
2636
2637           if (TREE_CODE (op) == SSA_NAME)
2638             def_stmt = SSA_NAME_DEF_STMT (op);
2639
2640           /* Check that the other def is either defined in the loop
2641             ("vect_internal_def"), or it's an induction (defined by a
2642             loop-header phi-node).  */
2643           if (def_stmt
2644               && gimple_bb (def_stmt)
2645               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2646               && (is_gimple_assign (def_stmt)
2647                   || is_gimple_call (def_stmt)
2648                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2649                               == vect_induction_def
2650                   || (gimple_code (def_stmt) == GIMPLE_PHI
2651                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2652                                   == vect_internal_def
2653                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2654             {
2655               if (dump_enabled_p ())
2656                 {
2657                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2658                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2659                 }
2660
2661               swap_ssa_operands (next_stmt,
2662                                  gimple_assign_rhs1_ptr (next_stmt),
2663                                  gimple_assign_rhs2_ptr (next_stmt));
2664               update_stmt (next_stmt);
2665
2666               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2667                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2668             }
2669           else
2670             return false;
2671         }
2672
2673       lhs = gimple_assign_lhs (next_stmt);
2674       next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2675     }
2676
2677   /* Save the chain for further analysis in SLP detection.  */
2678   first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2679   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2680   REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2681
2682   return true;
2683 }
2684
2685 /* Return true if we need an in-order reduction for operation CODE
2686    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2687    overflow must wrap.  */
2688
2689 static bool
2690 needs_fold_left_reduction_p (tree type, tree_code code,
2691                              bool need_wrapping_integral_overflow)
2692 {
2693   /* CHECKME: check for !flag_finite_math_only too?  */
2694   if (SCALAR_FLOAT_TYPE_P (type))
2695     switch (code)
2696       {
2697       case MIN_EXPR:
2698       case MAX_EXPR:
2699         return false;
2700
2701       default:
2702         return !flag_associative_math;
2703       }
2704
2705   if (INTEGRAL_TYPE_P (type))
2706     {
2707       if (!operation_no_trapping_overflow (type, code))
2708         return true;
2709       if (need_wrapping_integral_overflow
2710           && !TYPE_OVERFLOW_WRAPS (type)
2711           && operation_can_overflow (code))
2712         return true;
2713       return false;
2714     }
2715
2716   if (SAT_FIXED_POINT_TYPE_P (type))
2717     return true;
2718
2719   return false;
2720 }
2721
2722 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2723    reduction operation CODE has a handled computation expression.  */
2724
2725 bool
2726 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2727                       enum tree_code code)
2728 {
2729   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2730   auto_bitmap visited;
2731   tree lookfor = PHI_RESULT (phi);
2732   ssa_op_iter curri;
2733   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2734   while (USE_FROM_PTR (curr) != loop_arg)
2735     curr = op_iter_next_use (&curri);
2736   curri.i = curri.numops;
2737   do
2738     {
2739       path.safe_push (std::make_pair (curri, curr));
2740       tree use = USE_FROM_PTR (curr);
2741       if (use == lookfor)
2742         break;
2743       gimple *def = SSA_NAME_DEF_STMT (use);
2744       if (gimple_nop_p (def)
2745           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2746         {
2747 pop:
2748           do
2749             {
2750               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2751               curri = x.first;
2752               curr = x.second;
2753               do
2754                 curr = op_iter_next_use (&curri);
2755               /* Skip already visited or non-SSA operands (from iterating
2756                  over PHI args).  */
2757               while (curr != NULL_USE_OPERAND_P
2758                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2759                          || ! bitmap_set_bit (visited,
2760                                               SSA_NAME_VERSION
2761                                                 (USE_FROM_PTR (curr)))));
2762             }
2763           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2764           if (curr == NULL_USE_OPERAND_P)
2765             break;
2766         }
2767       else
2768         {
2769           if (gimple_code (def) == GIMPLE_PHI)
2770             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2771           else
2772             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2773           while (curr != NULL_USE_OPERAND_P
2774                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2775                      || ! bitmap_set_bit (visited,
2776                                           SSA_NAME_VERSION
2777                                             (USE_FROM_PTR (curr)))))
2778             curr = op_iter_next_use (&curri);
2779           if (curr == NULL_USE_OPERAND_P)
2780             goto pop;
2781         }
2782     }
2783   while (1);
2784   if (dump_file && (dump_flags & TDF_DETAILS))
2785     {
2786       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2787       unsigned i;
2788       std::pair<ssa_op_iter, use_operand_p> *x;
2789       FOR_EACH_VEC_ELT (path, i, x)
2790         {
2791           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2792           dump_printf (MSG_NOTE, " ");
2793         }
2794       dump_printf (MSG_NOTE, "\n");
2795     }
2796
2797   /* Check whether the reduction path detected is valid.  */
2798   bool fail = path.length () == 0;
2799   bool neg = false;
2800   for (unsigned i = 1; i < path.length (); ++i)
2801     {
2802       gimple *use_stmt = USE_STMT (path[i].second);
2803       tree op = USE_FROM_PTR (path[i].second);
2804       if (! has_single_use (op)
2805           || ! is_gimple_assign (use_stmt))
2806         {
2807           fail = true;
2808           break;
2809         }
2810       if (gimple_assign_rhs_code (use_stmt) != code)
2811         {
2812           if (code == PLUS_EXPR
2813               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2814             {
2815               /* Track whether we negate the reduction value each iteration.  */
2816               if (gimple_assign_rhs2 (use_stmt) == op)
2817                 neg = ! neg;
2818             }
2819           else
2820             {
2821               fail = true;
2822               break;
2823             }
2824         }
2825     }
2826   return ! fail && ! neg;
2827 }
2828
2829
2830 /* Function vect_is_simple_reduction
2831
2832    (1) Detect a cross-iteration def-use cycle that represents a simple
2833    reduction computation.  We look for the following pattern:
2834
2835    loop_header:
2836      a1 = phi < a0, a2 >
2837      a3 = ...
2838      a2 = operation (a3, a1)
2839
2840    or
2841
2842    a3 = ...
2843    loop_header:
2844      a1 = phi < a0, a2 >
2845      a2 = operation (a3, a1)
2846
2847    such that:
2848    1. operation is commutative and associative and it is safe to
2849       change the order of the computation
2850    2. no uses for a2 in the loop (a2 is used out of the loop)
2851    3. no uses of a1 in the loop besides the reduction operation
2852    4. no uses of a1 outside the loop.
2853
2854    Conditions 1,4 are tested here.
2855    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2856
2857    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2858    nested cycles.
2859
2860    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2861    reductions:
2862
2863      a1 = phi < a0, a2 >
2864      inner loop (def of a3)
2865      a2 = phi < a3 >
2866
2867    (4) Detect condition expressions, ie:
2868      for (int i = 0; i < N; i++)
2869        if (a[i] < val)
2870         ret_val = a[i];
2871
2872 */
2873
2874 static gimple *
2875 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2876                           bool *double_reduc,
2877                           bool need_wrapping_integral_overflow,
2878                           enum vect_reduction_type *v_reduc_type)
2879 {
2880   struct loop *loop = (gimple_bb (phi))->loop_father;
2881   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2882   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2883   enum tree_code orig_code, code;
2884   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2885   tree type;
2886   int nloop_uses;
2887   tree name;
2888   imm_use_iterator imm_iter;
2889   use_operand_p use_p;
2890   bool phi_def;
2891
2892   *double_reduc = false;
2893   *v_reduc_type = TREE_CODE_REDUCTION;
2894
2895   tree phi_name = PHI_RESULT (phi);
2896   /* ???  If there are no uses of the PHI result the inner loop reduction
2897      won't be detected as possibly double-reduction by vectorizable_reduction
2898      because that tries to walk the PHI arg from the preheader edge which
2899      can be constant.  See PR60382.  */
2900   if (has_zero_uses (phi_name))
2901     return NULL;
2902   nloop_uses = 0;
2903   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2904     {
2905       gimple *use_stmt = USE_STMT (use_p);
2906       if (is_gimple_debug (use_stmt))
2907         continue;
2908
2909       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2910         {
2911           if (dump_enabled_p ())
2912             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2913                              "intermediate value used outside loop.\n");
2914
2915           return NULL;
2916         }
2917
2918       nloop_uses++;
2919       if (nloop_uses > 1)
2920         {
2921           if (dump_enabled_p ())
2922             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2923                              "reduction value used in loop.\n");
2924           return NULL;
2925         }
2926
2927       phi_use_stmt = use_stmt;
2928     }
2929
2930   edge latch_e = loop_latch_edge (loop);
2931   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2932   if (TREE_CODE (loop_arg) != SSA_NAME)
2933     {
2934       if (dump_enabled_p ())
2935         {
2936           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2937                            "reduction: not ssa_name: ");
2938           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2939           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2940         }
2941       return NULL;
2942     }
2943
2944   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2945   if (is_gimple_assign (def_stmt))
2946     {
2947       name = gimple_assign_lhs (def_stmt);
2948       phi_def = false;
2949     }
2950   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2951     {
2952       name = PHI_RESULT (def_stmt);
2953       phi_def = true;
2954     }
2955   else
2956     {
2957       if (dump_enabled_p ())
2958         {
2959           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2960                            "reduction: unhandled reduction operation: ");
2961           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2962         }
2963       return NULL;
2964     }
2965
2966   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2967     return NULL;
2968
2969   nloop_uses = 0;
2970   auto_vec<gphi *, 3> lcphis;
2971   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2972     {
2973       gimple *use_stmt = USE_STMT (use_p);
2974       if (is_gimple_debug (use_stmt))
2975         continue;
2976       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2977         nloop_uses++;
2978       else
2979         /* We can have more than one loop-closed PHI.  */
2980         lcphis.safe_push (as_a <gphi *> (use_stmt));
2981       if (nloop_uses > 1)
2982         {
2983           if (dump_enabled_p ())
2984             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2985                              "reduction used in loop.\n");
2986           return NULL;
2987         }
2988     }
2989
2990   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2991      defined in the inner loop.  */
2992   if (phi_def)
2993     {
2994       op1 = PHI_ARG_DEF (def_stmt, 0);
2995
2996       if (gimple_phi_num_args (def_stmt) != 1
2997           || TREE_CODE (op1) != SSA_NAME)
2998         {
2999           if (dump_enabled_p ())
3000             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001                              "unsupported phi node definition.\n");
3002
3003           return NULL;
3004         }
3005
3006       def1 = SSA_NAME_DEF_STMT (op1);
3007       if (gimple_bb (def1)
3008           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3009           && loop->inner
3010           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3011           && is_gimple_assign (def1)
3012           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3013         {
3014           if (dump_enabled_p ())
3015             report_vect_op (MSG_NOTE, def_stmt,
3016                             "detected double reduction: ");
3017
3018           *double_reduc = true;
3019           return def_stmt;
3020         }
3021
3022       return NULL;
3023     }
3024
3025   /* If we are vectorizing an inner reduction we are executing that
3026      in the original order only in case we are not dealing with a
3027      double reduction.  */
3028   bool check_reduction = true;
3029   if (flow_loop_nested_p (vect_loop, loop))
3030     {
3031       gphi *lcphi;
3032       unsigned i;
3033       check_reduction = false;
3034       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3035         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3036           {
3037             gimple *use_stmt = USE_STMT (use_p);
3038             if (is_gimple_debug (use_stmt))
3039               continue;
3040             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3041               check_reduction = true;
3042           }
3043     }
3044
3045   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3046   code = orig_code = gimple_assign_rhs_code (def_stmt);
3047
3048   /* We can handle "res -= x[i]", which is non-associative by
3049      simply rewriting this into "res += -x[i]".  Avoid changing
3050      gimple instruction for the first simple tests and only do this
3051      if we're allowed to change code at all.  */
3052   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3053     code = PLUS_EXPR;
3054
3055   if (code == COND_EXPR)
3056     {
3057       if (! nested_in_vect_loop)
3058         *v_reduc_type = COND_REDUCTION;
3059
3060       op3 = gimple_assign_rhs1 (def_stmt);
3061       if (COMPARISON_CLASS_P (op3))
3062         {
3063           op4 = TREE_OPERAND (op3, 1);
3064           op3 = TREE_OPERAND (op3, 0);
3065         }
3066       if (op3 == phi_name || op4 == phi_name)
3067         {
3068           if (dump_enabled_p ())
3069             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3070                             "reduction: condition depends on previous"
3071                             " iteration: ");
3072           return NULL;
3073         }
3074
3075       op1 = gimple_assign_rhs2 (def_stmt);
3076       op2 = gimple_assign_rhs3 (def_stmt);
3077     }
3078   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3079     {
3080       if (dump_enabled_p ())
3081         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3082                         "reduction: not commutative/associative: ");
3083       return NULL;
3084     }
3085   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3086     {
3087       op1 = gimple_assign_rhs1 (def_stmt);
3088       op2 = gimple_assign_rhs2 (def_stmt);
3089     }
3090   else
3091     {
3092       if (dump_enabled_p ())
3093         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3094                         "reduction: not handled operation: ");
3095       return NULL;
3096     }
3097
3098   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3099     {
3100       if (dump_enabled_p ())
3101         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3102                         "reduction: both uses not ssa_names: ");
3103
3104       return NULL;
3105     }
3106
3107   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3108   if ((TREE_CODE (op1) == SSA_NAME
3109        && !types_compatible_p (type,TREE_TYPE (op1)))
3110       || (TREE_CODE (op2) == SSA_NAME
3111           && !types_compatible_p (type, TREE_TYPE (op2)))
3112       || (op3 && TREE_CODE (op3) == SSA_NAME
3113           && !types_compatible_p (type, TREE_TYPE (op3)))
3114       || (op4 && TREE_CODE (op4) == SSA_NAME
3115           && !types_compatible_p (type, TREE_TYPE (op4))))
3116     {
3117       if (dump_enabled_p ())
3118         {
3119           dump_printf_loc (MSG_NOTE, vect_location,
3120                            "reduction: multiple types: operation type: ");
3121           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3122           dump_printf (MSG_NOTE, ", operands types: ");
3123           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3124                              TREE_TYPE (op1));
3125           dump_printf (MSG_NOTE, ",");
3126           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3127                              TREE_TYPE (op2));
3128           if (op3)
3129             {
3130               dump_printf (MSG_NOTE, ",");
3131               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3132                                  TREE_TYPE (op3));
3133             }
3134
3135           if (op4)
3136             {
3137               dump_printf (MSG_NOTE, ",");
3138               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3139                                  TREE_TYPE (op4));
3140             }
3141           dump_printf (MSG_NOTE, "\n");
3142         }
3143
3144       return NULL;
3145     }
3146
3147   /* Check whether it's ok to change the order of the computation.
3148      Generally, when vectorizing a reduction we change the order of the
3149      computation.  This may change the behavior of the program in some
3150      cases, so we need to check that this is ok.  One exception is when
3151      vectorizing an outer-loop: the inner-loop is executed sequentially,
3152      and therefore vectorizing reductions in the inner-loop during
3153      outer-loop vectorization is safe.  */
3154   if (check_reduction
3155       && *v_reduc_type == TREE_CODE_REDUCTION
3156       && needs_fold_left_reduction_p (type, code,
3157                                       need_wrapping_integral_overflow))
3158     *v_reduc_type = FOLD_LEFT_REDUCTION;
3159
3160   /* Reduction is safe. We're dealing with one of the following:
3161      1) integer arithmetic and no trapv
3162      2) floating point arithmetic, and special flags permit this optimization
3163      3) nested cycle (i.e., outer loop vectorization).  */
3164   if (TREE_CODE (op1) == SSA_NAME)
3165     def1 = SSA_NAME_DEF_STMT (op1);
3166
3167   if (TREE_CODE (op2) == SSA_NAME)
3168     def2 = SSA_NAME_DEF_STMT (op2);
3169
3170   if (code != COND_EXPR
3171       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3172     {
3173       if (dump_enabled_p ())
3174         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3175       return NULL;
3176     }
3177
3178   /* Check that one def is the reduction def, defined by PHI,
3179      the other def is either defined in the loop ("vect_internal_def"),
3180      or it's an induction (defined by a loop-header phi-node).  */
3181
3182   if (def2 && def2 == phi
3183       && (code == COND_EXPR
3184           || !def1 || gimple_nop_p (def1)
3185           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3186           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3187               && (is_gimple_assign (def1)
3188                   || is_gimple_call (def1)
3189                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3190                       == vect_induction_def
3191                   || (gimple_code (def1) == GIMPLE_PHI
3192                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3193                           == vect_internal_def
3194                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3195     {
3196       if (dump_enabled_p ())
3197         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3198       return def_stmt;
3199     }
3200
3201   if (def1 && def1 == phi
3202       && (code == COND_EXPR
3203           || !def2 || gimple_nop_p (def2)
3204           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3205           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3206               && (is_gimple_assign (def2)
3207                   || is_gimple_call (def2)
3208                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3209                        == vect_induction_def
3210                   || (gimple_code (def2) == GIMPLE_PHI
3211                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3212                            == vect_internal_def
3213                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3214     {
3215       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3216         {
3217           /* Check if we can swap operands (just for simplicity - so that
3218              the rest of the code can assume that the reduction variable
3219              is always the last (second) argument).  */
3220           if (code == COND_EXPR)
3221             {
3222               /* Swap cond_expr by inverting the condition.  */
3223               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3224               enum tree_code invert_code = ERROR_MARK;
3225               enum tree_code cond_code = TREE_CODE (cond_expr);
3226
3227               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3228                 {
3229                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3230                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3231                 }
3232               if (invert_code != ERROR_MARK)
3233                 {
3234                   TREE_SET_CODE (cond_expr, invert_code);
3235                   swap_ssa_operands (def_stmt,
3236                                      gimple_assign_rhs2_ptr (def_stmt),
3237                                      gimple_assign_rhs3_ptr (def_stmt));
3238                 }
3239               else
3240                 {
3241                   if (dump_enabled_p ())
3242                     report_vect_op (MSG_NOTE, def_stmt,
3243                                     "detected reduction: cannot swap operands "
3244                                     "for cond_expr");
3245                   return NULL;
3246                 }
3247             }
3248           else
3249             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3250                                gimple_assign_rhs2_ptr (def_stmt));
3251
3252           if (dump_enabled_p ())
3253             report_vect_op (MSG_NOTE, def_stmt,
3254                             "detected reduction: need to swap operands: ");
3255
3256           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3257             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3258         }
3259       else
3260         {
3261           if (dump_enabled_p ())
3262             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3263         }
3264
3265       return def_stmt;
3266     }
3267
3268   /* Try to find SLP reduction chain.  */
3269   if (! nested_in_vect_loop
3270       && code != COND_EXPR
3271       && orig_code != MINUS_EXPR
3272       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3273     {
3274       if (dump_enabled_p ())
3275         report_vect_op (MSG_NOTE, def_stmt,
3276                         "reduction: detected reduction chain: ");
3277
3278       return def_stmt;
3279     }
3280
3281   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3282   gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3283   while (first)
3284     {
3285       gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3286       REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3287       REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3288       first = next;
3289     }
3290
3291   /* Look for the expression computing loop_arg from loop PHI result.  */
3292   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3293                             code))
3294     return def_stmt;
3295
3296   if (dump_enabled_p ())
3297     {
3298       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3299                       "reduction: unknown pattern: ");
3300     }
3301
3302   return NULL;
3303 }
3304
3305 /* Wrapper around vect_is_simple_reduction, which will modify code
3306    in-place if it enables detection of more reductions.  Arguments
3307    as there.

        Returns whatever vect_is_simple_reduction returned for PHI.  When
        that result is non-null, the detected reduction type is stored on
        both PHI and the returned statement, and the two are linked to each
        other through STMT_VINFO_REDUC_DEF.  When it is null, nothing is
        recorded.  */
3308
3309 gimple *
3310 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3311                              bool *double_reduc,
3312                              bool need_wrapping_integral_overflow)
3313 {
3314   enum vect_reduction_type v_reduc_type;
3315   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3316                                           need_wrapping_integral_overflow,
3317                                           &v_reduc_type);
3318   if (def)
3319     {
           /* Record the reduction type on PHI and point it at DEF.  */
3320       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3321       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3322       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
           /* Mirror the same information on DEF, pointing back at PHI.  */
3323       reduc_def_info = vinfo_for_stmt (def);
3324       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3325       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3326     }
3327   return def;
3328 }
3329
3330 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.

        PEEL_ITERS_PROLOGUE is the requested number of prologue iterations;
        when the iteration count of the loop is known it is capped to that
        count.  *PEEL_ITERS_EPILOGUE is set to the number of epilogue
        iterations: VF/2 when the iteration count is unknown, otherwise the
        remainder left after the prologue (or a full VF when peeling for gaps
        is required and that remainder is zero).

        Every entry of SCALAR_COST_VEC is recorded into PROLOGUE_COST_VEC and
        EPILOGUE_COST_VEC with its count multiplied by the respective number
        of peeled iterations.  When the iteration count is unknown, one taken
        branch is additionally recorded for each of the two peeled loops.

        Returns the sum of the values returned by record_stmt_cost.  */
3331 int
3332 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3333                              int *peel_iters_epilogue,
3334                              stmt_vector_for_cost *scalar_cost_vec,
3335                              stmt_vector_for_cost *prologue_cost_vec,
3336                              stmt_vector_for_cost *epilogue_cost_vec)
3337 {
3338   int retval = 0;
3339   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3340
3341   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3342     {
3343       *peel_iters_epilogue = assumed_vf / 2;
3344       if (dump_enabled_p ())
3345         dump_printf_loc (MSG_NOTE, vect_location,
3346                          "cost model: epilogue peel iters set to vf/2 "
3347                          "because loop iterations are unknown .\n");
3348
3349       /* If peeled iterations are known but number of scalar loop
3350          iterations are unknown, count a taken branch per peeled loop.
              The epilogue branch belongs in EPILOGUE_COST_VEC and must be
              accumulated into RETVAL rather than overwrite the prologue
              branch cost.  */
3351       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3352                                  NULL, 0, vect_prologue);
3353       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3354                                   NULL, 0, vect_epilogue);
3355     }
3356   else
3357     {
3358       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
           /* Never peel more prologue iterations than the loop executes.  */
3359       peel_iters_prologue = niters < peel_iters_prologue ?
3360                             niters : peel_iters_prologue;
3361       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3362       /* If we need to peel for gaps, but no peeling is required, we have to
3363          peel VF iterations.  */
3364       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3365         *peel_iters_epilogue = assumed_vf;
3366     }
3367
3368   stmt_info_for_cost *si;
3369   int j;
       /* Charge the scalar iteration cost once per peeled prologue
          iteration.  */
3370   if (peel_iters_prologue)
3371     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3372         {
3373           stmt_vec_info stmt_info
3374             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3375           retval += record_stmt_cost (prologue_cost_vec,
3376                                       si->count * peel_iters_prologue,
3377                                       si->kind, stmt_info, si->misalign,
3378                                       vect_prologue);
3379         }
       /* Likewise for every peeled epilogue iteration.  */
3380   if (*peel_iters_epilogue)
3381     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3382         {
3383           stmt_vec_info stmt_info
3384             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3385           retval += record_stmt_cost (epilogue_cost_vec,
3386                                       si->count * *peel_iters_epilogue,
3387                                       si->kind, stmt_info, si->misalign,
3388                                       vect_epilogue);
3389         }
3390
3391   return retval;
3392 }
3393
3394 /* Function vect_estimate_min_profitable_iters
3395
3396    Return the number of iterations required for the vector version of the
3397    loop to be profitable relative to the cost of the scalar version of the
3398    loop.
3399
3400    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3401    of iterations for vectorization.  -1 value means loop vectorization
3402    is not profitable.  This returned value may be used for dynamic
3403    profitability check.
3404
3405    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3406    for static check against estimated number of iterations.

        Both outputs are set to 0 when the cost model is disabled, and both
        are set to -1 when the vector loop body is not cheaper than
        ASSUMED_VF scalar iterations.  */
3407
3408 static void
3409 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3410                                     int *ret_min_profitable_niters,
3411                                     int *ret_min_profitable_estimate)
3412 {
3413   int min_profitable_iters;
3414   int min_profitable_estimate;
3415   int peel_iters_prologue;
3416   int peel_iters_epilogue;
3417   unsigned vec_inside_cost = 0;
3418   int vec_outside_cost = 0;
3419   unsigned vec_prologue_cost = 0;
3420   unsigned vec_epilogue_cost = 0;
3421   int scalar_single_iter_cost = 0;
3422   int scalar_outside_cost = 0;
3423   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3424   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3425   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3426
3427   /* Cost model disabled.  */
3428   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3429     {
3430       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3431       *ret_min_profitable_niters = 0;
3432       *ret_min_profitable_estimate = 0;
3433       return;
3434     }
3435
3436   /* Requires loop versioning tests to handle misalignment.  */
3437   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3438     {
3439       /*  FIXME: Make cost depend on complexity of individual check.  */
3440       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3441       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3442                             vect_prologue);
3443       dump_printf (MSG_NOTE,
3444                    "cost model: Adding cost of checks for loop "
3445                    "versioning to treat misalignment.\n");
3446     }
3447
3448   /* Requires loop versioning with alias checks.  */
3449   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3450     {
3451       /*  FIXME: Make cost depend on complexity of individual check.  */
3452       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3453       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3454                             vect_prologue);
3455       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3456       if (len)
3457         /* Count LEN - 1 ANDs and LEN comparisons.  */
3458         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3459                               NULL, 0, vect_prologue);
3460       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3461       if (len)
3462         {
3463           /* Count LEN - 1 ANDs and LEN comparisons.  */
3464           unsigned int nstmts = len * 2 - 1;
3465           /* +1 for each bias that needs adding.  */
3466           for (unsigned int i = 0; i < len; ++i)
3467             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3468               nstmts += 1;
3469           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3470                                 NULL, 0, vect_prologue);
3471         }
3472       dump_printf (MSG_NOTE,
3473                    "cost model: Adding cost of checks for loop "
3474                    "versioning aliasing.\n");
3475     }
3476
3477   /* Requires loop versioning with niter checks.  */
3478   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3479     {
3480       /*  FIXME: Make cost depend on complexity of individual check.  */
3481       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3482                             vect_prologue);
3483       dump_printf (MSG_NOTE,
3484                    "cost model: Adding cost of checks for loop "
3485                    "versioning niters.\n");
3486     }
3487
       /* Any kind of versioning also costs one taken branch in the
          prologue.  */
3488   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3489     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3490                           vect_prologue);
3491
3492   /* Count statements in scalar loop.  Using this as scalar cost for a single
3493      iteration for now.
3494
3495      TODO: Add outer loop support.
3496
3497      TODO: Consider assigning different costs to different scalar
3498      statements.  */
3499
3500   scalar_single_iter_cost
3501     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3502
3503   /* Add additional cost for the peeled instructions in prologue and epilogue
3504      loop.  (For fully-masked loops there will be no peeling.)
3505
3506      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3507      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3508
3509      TODO: Build an expression that represents peel_iters for prologue and
3510      epilogue to be used in a run-time test.  */
3511
3512   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3513     {
3514       peel_iters_prologue = 0;
3515       peel_iters_epilogue = 0;
3516
3517       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3518         {
3519           /* We need to peel exactly one iteration.  */
3520           peel_iters_epilogue += 1;
3521           stmt_info_for_cost *si;
3522           int j;
3523           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3524                             j, si)
3525             {
3526               struct _stmt_vec_info *stmt_info
3527                 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3528               (void) add_stmt_cost (target_cost_data, si->count,
3529                                     si->kind, stmt_info, si->misalign,
3530                                     vect_epilogue);
3531             }
3532         }
3533     }
       /* NPEEL < 0: the amount of peeling for alignment is not known at
          compile time, so both peel counts are estimated as vf/2.  */
3534   else if (npeel < 0)
3535     {
3536       peel_iters_prologue = assumed_vf / 2;
3537       dump_printf (MSG_NOTE, "cost model: "
3538                    "prologue peel iters set to vf/2.\n");
3539
3540       /* If peeling for alignment is unknown, loop bound of main loop becomes
3541          unknown.  */
3542       peel_iters_epilogue = assumed_vf / 2;
3543       dump_printf (MSG_NOTE, "cost model: "
3544                    "epilogue peel iters set to vf/2 because "
3545                    "peeling for alignment is unknown.\n");
3546
3547       /* If peeled iterations are unknown, count a taken branch and a not taken
3548          branch per peeled loop. Even if scalar loop iterations are known,
3549          vector iterations are not known since peeled prologue iterations are
3550          not known. Hence guards remain the same.  */
3551       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3552                             NULL, 0, vect_prologue);
3553       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3554                             NULL, 0, vect_prologue);
3555       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3556                             NULL, 0, vect_epilogue);
3557       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3558                             NULL, 0, vect_epilogue);
3559       stmt_info_for_cost *si;
3560       int j;
3561       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3562         {
3563           struct _stmt_vec_info *stmt_info
3564             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3565           (void) add_stmt_cost (target_cost_data,
3566                                 si->count * peel_iters_prologue,
3567                                 si->kind, stmt_info, si->misalign,
3568                                 vect_prologue);
3569           (void) add_stmt_cost (target_cost_data,
3570                                 si->count * peel_iters_epilogue,
3571                                 si->kind, stmt_info, si->misalign,
3572                                 vect_epilogue);
3573         }
3574     }
       /* Known prologue peel count: let vect_get_known_peeling_cost build the
          prologue and epilogue cost vectors, then feed every entry to the
          target cost model.  */
3575   else
3576     {
3577       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3578       stmt_info_for_cost *si;
3579       int j;
           /* Read through the same accessor as TARGET_COST_DATA above.  */
3580       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3581
3582       prologue_cost_vec.create (2);
3583       epilogue_cost_vec.create (2);
3584       peel_iters_prologue = npeel;
3585
           /* Only the cost vectors and *PEEL_ITERS_EPILOGUE are used here; the
              returned total is deliberately ignored.  */
3586       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3587                                           &peel_iters_epilogue,
3588                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3589                                             (loop_vinfo),
3590                                           &prologue_cost_vec,
3591                                           &epilogue_cost_vec);
3592
3593       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3594         {
3595           struct _stmt_vec_info *stmt_info
3596             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3597           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3598                                 si->misalign, vect_prologue);
3599         }
3600
3601       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3602         {
3603           struct _stmt_vec_info *stmt_info
3604             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3605           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3606                                 si->misalign, vect_epilogue);
3607         }
3608
3609       prologue_cost_vec.release ();
3610       epilogue_cost_vec.release ();
3611     }
3612
3613   /* FORNOW: The scalar outside cost is incremented in one of the
3614      following ways:
3615
3616      1. The vectorizer checks for alignment and aliasing and generates
3617      a condition that allows dynamic vectorization.  A cost model
3618      check is ANDED with the versioning condition.  Hence scalar code
3619      path now has the added cost of the versioning check.
3620
3621        if (cost > th & versioning_check)
3622          jmp to vector code
3623
3624      Hence run-time scalar is incremented by not-taken branch cost.
3625
3626      2. The vectorizer then checks if a prologue is required.  If the
3627      cost model check was not done before during versioning, it has to
3628      be done before the prologue check.
3629
3630        if (cost <= th)
3631          prologue = scalar_iters
3632        if (prologue == 0)
3633          jmp to vector code
3634        else
3635          execute prologue
3636        if (prologue == num_iters)
3637          go to exit
3638
3639      Hence the run-time scalar cost is incremented by a taken branch,
3640      plus a not-taken branch, plus a taken branch cost.
3641
3642      3. The vectorizer then checks if an epilogue is required.  If the
3643      cost model check was not done before during prologue check, it
3644      has to be done with the epilogue check.
3645
3646        if (prologue == 0)
3647          jmp to vector code
3648        else
3649          execute prologue
3650        if (prologue == num_iters)
3651          go to exit
3652        vector code:
3653          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3654            jmp to epilogue
3655
3656      Hence the run-time scalar cost should be incremented by 2 taken
3657      branches.
3658
3659      TODO: The back end may reorder the BBS's differently and reverse
3660      conditions/branch directions.  Change the estimates below to
3661      something more reasonable.  */
3662
3663   /* If the number of iterations is known and we do not do versioning, we can
3664      decide whether to vectorize at compile time.  Hence the scalar version
3665      does not carry cost model guard costs.  */
3666   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3667       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3668     {
3669       /* Cost model check occurs at versioning.  */
3670       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3671         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3672       else
3673         {
3674           /* Cost model check occurs at prologue generation.  */
3675           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3676             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3677               + vect_get_stmt_cost (cond_branch_not_taken);
3678           /* Cost model check occurs at epilogue generation.  */
3679           else
3680             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3681         }
3682     }
3683
3684   /* Complete the target-specific cost calculations.  */
3685   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3686                &vec_inside_cost, &vec_epilogue_cost);
3687
3688   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3689
3690   if (dump_enabled_p ())
3691     {
3692       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3693       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3694                    vec_inside_cost);
3695       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3696                    vec_prologue_cost);
3697       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3698                    vec_epilogue_cost);
3699       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3700                    scalar_single_iter_cost);
3701       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3702                    scalar_outside_cost);
3703       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3704                    vec_outside_cost);
3705       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3706                    peel_iters_prologue);
3707       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3708                    peel_iters_epilogue);
3709     }
3710
3711   /* Calculate number of iterations required to make the vector version
3712      profitable, relative to the loop bodies only.  The following condition
3713      must hold true:
3714      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3715      where
3716      SIC = scalar iteration cost, VIC = vector iteration cost,
3717      VOC = vector outside cost, VF = vectorization factor,
3718      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3719      SOC = scalar outside cost for run time cost model check.  */
3720
3721   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3722     {
           /* NOTE(review): vec_inside_cost is unsigned while the other
              operands are int, so the subtractions below mix signedness
              before the result is stored in an int; confirm this is
              intended.  */
3723       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3724                               * assumed_vf
3725                               - vec_inside_cost * peel_iters_prologue
3726                               - vec_inside_cost * peel_iters_epilogue);
3727       if (min_profitable_iters <= 0)
3728         min_profitable_iters = 0;
3729       else
3730         {
3731           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3732                                    - vec_inside_cost);
3733
               /* The division truncates; step up by one iteration if the
                  scalar cost at the truncated count does not yet exceed the
                  vector cost.  */
3734           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3735               <= (((int) vec_inside_cost * min_profitable_iters)
3736                   + (((int) vec_outside_cost - scalar_outside_cost)
3737                      * assumed_vf)))
3738             min_profitable_iters++;
3739         }
3740     }
3741   /* vector version will never be profitable.  */
3742   else
3743     {
3744       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3745         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3746                     "did not happen for a simd loop");
3747
3748       if (dump_enabled_p ())
3749         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3750                          "cost model: the vector iteration cost = %d "
3751                          "divided by the scalar iteration cost = %d "
3752                          "is greater or equal to the vectorization factor = %d"
3753                          ".\n",
3754                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3755       *ret_min_profitable_niters = -1;
3756       *ret_min_profitable_estimate = -1;
3757       return;
3758     }
3759
3760   dump_printf (MSG_NOTE,
3761                "  Calculated minimum iters for profitability: %d\n",
3762                min_profitable_iters);
3763
3764   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3765       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3766     /* We want the vectorized loop to execute at least once.  */
3767     min_profitable_iters = assumed_vf + peel_iters_prologue;
3768
3769   if (dump_enabled_p ())
3770     dump_printf_loc (MSG_NOTE, vect_location,
3771                      "  Runtime profitability threshold = %d\n",
3772                      min_profitable_iters);
3773
3774   *ret_min_profitable_niters = min_profitable_iters;
3775
3776   /* Calculate number of iterations required to make the vector version
3777      profitable, relative to the loop bodies only.
3778
3779      Non-vectorized variant is SIC * niters and it must win over vector
3780      variant on the expected loop trip count.  The following condition must hold true:
3781      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3782
3783   if (vec_outside_cost <= 0)
3784     min_profitable_estimate = 0;
3785   else
3786     {
           /* The divisor is positive here: the case where it is not took the
              early return above.  */
3787       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3788                                  * assumed_vf
3789                                  - vec_inside_cost * peel_iters_prologue
3790                                  - vec_inside_cost * peel_iters_epilogue)
3791                                  / ((scalar_single_iter_cost * assumed_vf)
3792                                    - vec_inside_cost);
3793     }
       /* The static estimate is never allowed to be below the runtime
          threshold.  */
3794   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3795   if (dump_enabled_p ())
3796     dump_printf_loc (MSG_NOTE, vect_location,
3797                      "  Static estimate profitability threshold = %d\n",
3798                      min_profitable_estimate);
3799
3800   *ret_min_profitable_estimate = min_profitable_estimate;
3801 }
3802
3803 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3804    vector elements (not bits) for a vector with NELT elements.  */
3805 static void
3806 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3807                               vec_perm_builder *sel)
3808 {
3809   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3810      by vec_perm_indices.  */
3811   sel->new_vector (nelt, 1, 3);
       /* Only three indices are pushed explicitly: OFFSET, OFFSET + 1 and
          OFFSET + 2.  Presumably the builder extrapolates the remaining
          NELT - 3 elements from this stepped pattern -- TODO confirm against
          vec_perm_builder.  */
3812   for (unsigned int i = 0; i < 3; i++)
3813     sel->quick_push (i + offset);
3814 }
3815
3816 /* Checks whether the target supports whole-vector shifts for vectors of mode
3817    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3818    it supports vec_perm_const with masks for all necessary shift amounts.
        Returns false for modes whose number of units is not a compile-time
        constant unless vec_shr_optab handles them.  */
3819 static bool
3820 have_whole_vector_shift (machine_mode mode)
3821 {
3822   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3823     return true;
3824
3825   /* Variable-length vectors should be handled via the optab.  */
3826   unsigned int nelt;
3827   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3828     return false;
3829
3830   vec_perm_builder sel;
3831   vec_perm_indices indices;
       /* Try every shift amount NELT/2, NELT/4, ..., 1; all of them must be
          expressible as a constant permutation.  */
3832   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3833     {
3834       calc_vec_perm_mask_for_shift (i, nelt, &sel);
           /* NOTE(review): the literal 2 is presumably the number of input
              vectors of the permutation -- confirm against
              vec_perm_indices::new_vector.  */
3835       indices.new_vector (sel, 2, nelt);
3836       if (!can_vec_perm_const_p (mode, indices, false))
3837         return false;
3838     }
3839   return true;
3840 }
3841
3842 /* TODO: There is a close dependency between the vect_model_*_cost and
3843    vectorizable_* functions.  Redesign this to avoid maintenance issues.  */
3844
3845 /* Function vect_model_reduction_cost.
3846
3847    Models cost for a reduction operation, including the vector ops
3848    generated within the strip-mine loop, the initial definition before
3849    the loop, and the epilogue code that must be generated.  */
3850
3851 static void
3852 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3853                            int ncopies, stmt_vector_for_cost *cost_vec)
3854 {
3855   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3856   enum tree_code code;
3857   optab optab;
3858   tree vectype;
3859   gimple *orig_stmt;
3860   machine_mode mode;
3861   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3862   struct loop *loop = NULL;
3863
3864   if (loop_vinfo)
3865     loop = LOOP_VINFO_LOOP (loop_vinfo);
3866
3867   /* Condition reductions generate two reductions in the loop.  */
3868   vect_reduction_type reduction_type
3869     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3870   if (reduction_type == COND_REDUCTION)
3871     ncopies *= 2;
3872
3873   vectype = STMT_VINFO_VECTYPE (stmt_info);
3874   mode = TYPE_MODE (vectype);
3875   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3876
3877   if (!orig_stmt)
3878     orig_stmt = STMT_VINFO_STMT (stmt_info);
3879
3880   code = gimple_assign_rhs_code (orig_stmt);
3881
3882   if (reduction_type == EXTRACT_LAST_REDUCTION
3883       || reduction_type == FOLD_LEFT_REDUCTION)
3884     {
3885       /* No extra instructions needed in the prologue.  */
3886       prologue_cost = 0;
3887
3888       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3889         /* Count one reduction-like operation per vector.  */
3890         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3891                                         stmt_info, 0, vect_body);
3892       else
3893         {
3894           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
3895           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3896           inside_cost = record_stmt_cost (cost_vec, nelements,
3897                                           vec_to_scalar, stmt_info, 0,
3898                                           vect_body);
3899           inside_cost += record_stmt_cost (cost_vec, nelements,
3900                                            scalar_stmt, stmt_info, 0,
3901                                            vect_body);
3902         }
3903     }
3904   else
3905     {
3906       /* Add in cost for initial definition.
3907          For cond reduction we have four vectors: initial index, step,
3908          initial result of the data reduction, initial value of the index
3909          reduction.  */
3910       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3911       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3912                                          scalar_to_vec, stmt_info, 0,
3913                                          vect_prologue);
3914
3915       /* Cost of reduction op inside loop.  */
3916       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3917                                       stmt_info, 0, vect_body);
3918     }
3919
3920   /* Determine cost of epilogue code.
3921
3922      We have a reduction operator that will reduce the vector in one statement.
3923      Also requires scalar extract.  */
3924
3925   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3926     {
3927       if (reduc_fn != IFN_LAST)
3928         {
3929           if (reduction_type == COND_REDUCTION)
3930             {
3931               /* An EQ stmt and an COND_EXPR stmt.  */
3932               epilogue_cost += record_stmt_cost (cost_vec, 2,
3933                                                  vector_stmt, stmt_info, 0,
3934                                                  vect_epilogue);
3935               /* Reduction of the max index and a reduction of the found
3936                  values.  */
3937               epilogue_cost += record_stmt_cost (cost_vec, 2,
3938                                                  vec_to_scalar, stmt_info, 0,
3939                                                  vect_epilogue);
3940               /* A broadcast of the max value.  */
3941               epilogue_cost += record_stmt_cost (cost_vec, 1,
3942                                                  scalar_to_vec, stmt_info, 0,
3943                                                  vect_epilogue);
3944             }
3945           else
3946             {
3947               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3948                                                  stmt_info, 0, vect_epilogue);
3949               epilogue_cost += record_stmt_cost (cost_vec, 1,
3950                                                  vec_to_scalar, stmt_info, 0,
3951                                                  vect_epilogue);
3952             }
3953         }
3954       else if (reduction_type == COND_REDUCTION)
3955         {
3956           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3957           /* Extraction of scalar elements.  */
3958           epilogue_cost += record_stmt_cost (cost_vec,
3959                                              2 * estimated_nunits,
3960                                              vec_to_scalar, stmt_info, 0,
3961                                              vect_epilogue);
3962           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3963           epilogue_cost += record_stmt_cost (cost_vec,
3964                                              2 * estimated_nunits - 3,
3965                                              scalar_stmt, stmt_info, 0,
3966                                              vect_epilogue);
3967         }
3968       else if (reduction_type == EXTRACT_LAST_REDUCTION
3969                || reduction_type == FOLD_LEFT_REDUCTION)
3970         /* No extra instructions needed in the epilogue.  */
3971         ;
3972       else
3973         {
3974           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3975           tree bitsize =
3976             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3977           int element_bitsize = tree_to_uhwi (bitsize);
3978           int nelements = vec_size_in_bits / element_bitsize;
3979
3980           if (code == COND_EXPR)
3981             code = MAX_EXPR;
3982
3983           optab = optab_for_tree_code (code, vectype, optab_default);
3984
3985           /* We have a whole vector shift available.  */
3986           if (optab != unknown_optab
3987               && VECTOR_MODE_P (mode)
3988               && optab_handler (optab, mode) != CODE_FOR_nothing
3989               && have_whole_vector_shift (mode))
3990             {
3991               /* Final reduction via vector shifts and the reduction operator.
3992                  Also requires scalar extract.  */
3993               epilogue_cost += record_stmt_cost (cost_vec,
3994                                                  exact_log2 (nelements) * 2,
3995                                                  vector_stmt, stmt_info, 0,
3996                                                  vect_epilogue);
3997               epilogue_cost += record_stmt_cost (cost_vec, 1,
3998                                                  vec_to_scalar, stmt_info, 0,
3999                                                  vect_epilogue);
4000             }
4001           else
4002             /* Use extracts and reduction op for final reduction.  For N
4003                elements, we have N extracts and N-1 reduction ops.  */
4004             epilogue_cost += record_stmt_cost (cost_vec,
4005                                                nelements + nelements - 1,
4006                                                vector_stmt, stmt_info, 0,
4007                                                vect_epilogue);
4008         }
4009     }
4010
4011   if (dump_enabled_p ())
4012     dump_printf (MSG_NOTE,
4013                  "vect_model_reduction_cost: inside_cost = %d, "
4014                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4015                  prologue_cost, epilogue_cost);
4016 }
4017
4018
4019 /* Function vect_model_induction_cost.
4020
4021    Models cost for induction operations.  */
4022
4023 static void
4024 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4025                            stmt_vector_for_cost *cost_vec)
4026 {
4027   unsigned inside_cost, prologue_cost;
4028
4029   if (PURE_SLP_STMT (stmt_info))
4030     return;
4031
4032   /* loop cost for vec_loop.  */
4033   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4034                                   stmt_info, 0, vect_body);
4035
4036   /* prologue cost for vec_init and vec_step.  */
4037   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4038                                     stmt_info, 0, vect_prologue);
4039
4040   if (dump_enabled_p ())
4041     dump_printf_loc (MSG_NOTE, vect_location,
4042                      "vect_model_induction_cost: inside_cost = %d, "
4043                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4044 }
4045
4046
4047
4048 /* Function get_initial_def_for_reduction
4049
4050    Input:
4051    STMT - a stmt that performs a reduction operation in the loop.
4052    INIT_VAL - the initial value of the reduction variable
4053
4054    Output:
4055    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4056         of the reduction (used for adjusting the epilog - see below).
4057    Return a vector variable, initialized according to the operation that STMT
4058         performs. This vector will be used as the initial value of the
4059         vector of partial results.
4060
4061    Option1 (adjust in epilog): Initialize the vector as follows:
4062      add/bit or/xor:    [0,0,...,0,0]
4063      mult/bit and:      [1,1,...,1,1]
4064      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4065    and when necessary (e.g. add/mult case) let the caller know
4066    that it needs to adjust the result by init_val.
4067
4068    Option2: Initialize the vector as follows:
4069      add/bit or/xor:    [init_val,0,0,...,0]
4070      mult/bit and:      [init_val,1,1,...,1]
4071      min/max/cond_expr: [init_val,init_val,...,init_val]
4072    and no adjustments are needed.
4073
4074    For example, for the following code:
4075
4076    s = init_val;
4077    for (i=0;i<n;i++)
4078      s = s + a[i];
4079
4080    STMT is 's = s + a[i]', and the reduction variable is 's'.
4081    For a vector of 4 units, we want to return either [0,0,0,init_val],
4082    or [0,0,0,0] and let the caller know that it needs to adjust
4083    the result at the end by 'init_val'.
4084
4085    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4086    initialization vector is simpler (same element in all entries), if
4087    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4088
4089    A cost model should help decide between these two schemes.  */
4090
4091 tree
4092 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4093                                tree *adjustment_def)
4094 {
4095   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4096   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4097   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4098   tree scalar_type = TREE_TYPE (init_val);
4099   tree vectype = get_vectype_for_scalar_type (scalar_type);
4100   enum tree_code code = gimple_assign_rhs_code (stmt);
4101   tree def_for_init;
4102   tree init_def;
4103   bool nested_in_vect_loop = false;
4104   REAL_VALUE_TYPE real_init_val = dconst0;
4105   int int_init_val = 0;
4106   gimple *def_stmt = NULL;
4107   gimple_seq stmts = NULL;
4108
4109   gcc_assert (vectype);
4110
4111   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4112               || SCALAR_FLOAT_TYPE_P (scalar_type));
4113
4114   if (nested_in_vect_loop_p (loop, stmt))
4115     nested_in_vect_loop = true;
4116   else
4117     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4118
4119   /* In case of double reduction we only create a vector variable to be put
4120      in the reduction phi node.  The actual statement creation is done in
4121      vect_create_epilog_for_reduction.  */
4122   if (adjustment_def && nested_in_vect_loop
4123       && TREE_CODE (init_val) == SSA_NAME
4124       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4125       && gimple_code (def_stmt) == GIMPLE_PHI
4126       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4127       && vinfo_for_stmt (def_stmt)
4128       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4129           == vect_double_reduction_def)
4130     {
4131       *adjustment_def = NULL;
4132       return vect_create_destination_var (init_val, vectype);
4133     }
4134
4135   vect_reduction_type reduction_type
4136     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4137
4138   /* In case of a nested reduction do not use an adjustment def as
4139      that case is not supported by the epilogue generation correctly
4140      if ncopies is not one.  */
4141   if (adjustment_def && nested_in_vect_loop)
4142     {
4143       *adjustment_def = NULL;
4144       return vect_get_vec_def_for_operand (init_val, stmt);
4145     }
4146
4147   switch (code)
4148     {
4149     case WIDEN_SUM_EXPR:
4150     case DOT_PROD_EXPR:
4151     case SAD_EXPR:
4152     case PLUS_EXPR:
4153     case MINUS_EXPR:
4154     case BIT_IOR_EXPR:
4155     case BIT_XOR_EXPR:
4156     case MULT_EXPR:
4157     case BIT_AND_EXPR:
4158       {
4159         /* ADJUSTMENT_DEF is NULL when called from
4160            vect_create_epilog_for_reduction to vectorize double reduction.  */
4161         if (adjustment_def)
4162           *adjustment_def = init_val;
4163
4164         if (code == MULT_EXPR)
4165           {
4166             real_init_val = dconst1;
4167             int_init_val = 1;
4168           }
4169
4170         if (code == BIT_AND_EXPR)
4171           int_init_val = -1;
4172
4173         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4174           def_for_init = build_real (scalar_type, real_init_val);
4175         else
4176           def_for_init = build_int_cst (scalar_type, int_init_val);
4177
4178         if (adjustment_def)
4179           /* Option1: the first element is '0' or '1' as well.  */
4180           init_def = gimple_build_vector_from_val (&stmts, vectype,
4181                                                    def_for_init);
4182         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4183           {
4184             /* Option2 (variable length): the first element is INIT_VAL.  */
4185             init_def = gimple_build_vector_from_val (&stmts, vectype,
4186                                                      def_for_init);
4187             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4188                                      vectype, init_def, init_val);
4189           }
4190         else
4191           {
4192             /* Option2: the first element is INIT_VAL.  */
4193             tree_vector_builder elts (vectype, 1, 2);
4194             elts.quick_push (init_val);
4195             elts.quick_push (def_for_init);
4196             init_def = gimple_build_vector (&stmts, &elts);
4197           }
4198       }
4199       break;
4200
4201     case MIN_EXPR:
4202     case MAX_EXPR:
4203     case COND_EXPR:
4204       {
4205         if (adjustment_def)
4206           {
4207             *adjustment_def = NULL_TREE;
4208             if (reduction_type != COND_REDUCTION
4209                 && reduction_type != EXTRACT_LAST_REDUCTION)
4210               {
4211                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4212                 break;
4213               }
4214           }
4215         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4216         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4217       }
4218       break;
4219
4220     default:
4221       gcc_unreachable ();
4222     }
4223
4224   if (stmts)
4225     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4226   return init_def;
4227 }
4228
4229 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4230    NUMBER_OF_VECTORS is the number of vector defs to create.
4231    If NEUTRAL_OP is nonnull, introducing extra elements of that
4232    value will not change the result.  */
4233
4234 static void
4235 get_initial_defs_for_reduction (slp_tree slp_node,
4236                                 vec<tree> *vec_oprnds,
4237                                 unsigned int number_of_vectors,
4238                                 bool reduc_chain, tree neutral_op)
4239 {
4240   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4241   gimple *stmt = stmts[0];
4242   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4243   unsigned HOST_WIDE_INT nunits;
4244   unsigned j, number_of_places_left_in_vector;
4245   tree vector_type;
4246   tree vop;
4247   int group_size = stmts.length ();
4248   unsigned int vec_num, i;
4249   unsigned number_of_copies = 1;
4250   vec<tree> voprnds;
4251   voprnds.create (number_of_vectors);
4252   struct loop *loop;
4253   auto_vec<tree, 16> permute_results;
4254
4255   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4256
4257   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4258
4259   loop = (gimple_bb (stmt))->loop_father;
4260   gcc_assert (loop);
4261   edge pe = loop_preheader_edge (loop);
4262
4263   gcc_assert (!reduc_chain || neutral_op);
4264
4265   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4266      created vectors. It is greater than 1 if unrolling is performed.
4267
4268      For example, we have two scalar operands, s1 and s2 (e.g., group of
4269      strided accesses of size two), while NUNITS is four (i.e., four scalars
4270      of this type can be packed in a vector).  The output vector will contain
4271      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4272      will be 2).
4273
4274      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4275      vectors containing the operands.
4276
4277      For example, NUNITS is four as before, and the group size is 8
4278      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4279      {s5, s6, s7, s8}.  */
4280
4281   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4282     nunits = group_size;
4283
4284   number_of_copies = nunits * number_of_vectors / group_size;
4285
4286   number_of_places_left_in_vector = nunits;
4287   bool constant_p = true;
4288   tree_vector_builder elts (vector_type, nunits, 1);
4289   elts.quick_grow (nunits);
4290   for (j = 0; j < number_of_copies; j++)
4291     {
4292       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4293         {
4294           tree op;
4295           /* Get the def before the loop.  In reduction chain we have only
4296              one initial value.  */
4297           if ((j != (number_of_copies - 1)
4298                || (reduc_chain && i != 0))
4299               && neutral_op)
4300             op = neutral_op;
4301           else
4302             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4303
4304           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4305           number_of_places_left_in_vector--;
4306           elts[number_of_places_left_in_vector] = op;
4307           if (!CONSTANT_CLASS_P (op))
4308             constant_p = false;
4309
4310           if (number_of_places_left_in_vector == 0)
4311             {
4312               gimple_seq ctor_seq = NULL;
4313               tree init;
4314               if (constant_p && !neutral_op
4315                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4316                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4317                 /* Build the vector directly from ELTS.  */
4318                 init = gimple_build_vector (&ctor_seq, &elts);
4319               else if (neutral_op)
4320                 {
4321                   /* Build a vector of the neutral value and shift the
4322                      other elements into place.  */
4323                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4324                                                        neutral_op);
4325                   int k = nunits;
4326                   while (k > 0 && elts[k - 1] == neutral_op)
4327                     k -= 1;
4328                   while (k > 0)
4329                     {
4330                       k -= 1;
4331                       init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4332                                            vector_type, init, elts[k]);
4333                     }
4334                 }
4335               else
4336                 {
4337                   /* First time round, duplicate ELTS to fill the
4338                      required number of vectors, then cherry pick the
4339                      appropriate result for each iteration.  */
4340                   if (vec_oprnds->is_empty ())
4341                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4342                                               number_of_vectors,
4343                                               permute_results);
4344                   init = permute_results[number_of_vectors - j - 1];
4345                 }
4346               if (ctor_seq != NULL)
4347                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4348               voprnds.quick_push (init);
4349
4350               number_of_places_left_in_vector = nunits;
4351               elts.new_vector (vector_type, nunits, 1);
4352               elts.quick_grow (nunits);
4353               constant_p = true;
4354             }
4355         }
4356     }
4357
4358   /* Since the vectors are created in the reverse order, we should invert
4359      them.  */
4360   vec_num = voprnds.length ();
4361   for (j = vec_num; j != 0; j--)
4362     {
4363       vop = voprnds[j - 1];
4364       vec_oprnds->quick_push (vop);
4365     }
4366
4367   voprnds.release ();
4368
4369   /* In case that VF is greater than the unrolling factor needed for the SLP
4370      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4371      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4372      to replicate the vectors.  */
4373   tree neutral_vec = NULL;
4374   while (number_of_vectors > vec_oprnds->length ())
4375     {
4376       if (neutral_op)
4377         {
4378           if (!neutral_vec)
4379             {
4380               gimple_seq ctor_seq = NULL;
4381               neutral_vec = gimple_build_vector_from_val
4382                 (&ctor_seq, vector_type, neutral_op);
4383               if (ctor_seq != NULL)
4384                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4385             }
4386           vec_oprnds->quick_push (neutral_vec);
4387         }
4388       else
4389         {
4390           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4391             vec_oprnds->quick_push (vop);
4392         }
4393     }
4394 }
4395
4396
4397 /* Function vect_create_epilog_for_reduction
4398
4399    Create code at the loop-epilog to finalize the result of a reduction
4400    computation.
4401
4402    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4403      reduction statements.
4404    STMT is the scalar reduction stmt that is being vectorized.
4405    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4406      number of elements that we can fit in a vectype (nunits).  In this case
4407      we have to generate more than one vector stmt - i.e - we need to "unroll"
4408      the vector stmt by a factor VF/nunits.  For more details see documentation
4409      in vectorizable_operation.
4410    REDUC_FN is the internal function for the epilog reduction.
4411    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4412      computation.
4413    REDUC_INDEX is the index of the operand in the right hand side of the
4414      statement that is defined by REDUCTION_PHI.
4415    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4416    SLP_NODE is an SLP node containing a group of reduction statements. The
4417      first one in this group is STMT.
4418    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4419      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4420      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4421      any value of the IV in the loop.
4422    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4423    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4424      null if this is not an SLP reduction
4425
4426    This function:
4427    1. Creates the reduction def-use cycles: sets the arguments for
4428       REDUCTION_PHIS:
4429       The loop-entry argument is the vectorized initial-value of the reduction.
4430       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4431       sums.
4432    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4433       by calling the function specified by REDUC_FN if available, or by
4434       other means (whole-vector shifts or a scalar loop).
4435       The function also creates a new phi node at the loop exit to preserve
4436       loop-closed form, as illustrated below.
4437
4438      The flow at the entry to this function:
4439
4440         loop:
4441           vec_def = phi <null, null>            # REDUCTION_PHI
4442           VECT_DEF = vector_stmt                # vectorized form of STMT
4443           s_loop = scalar_stmt                  # (scalar) STMT
4444         loop_exit:
4445           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4446           use <s_out0>
4447           use <s_out0>
4448
4449      The above is transformed by this function into:
4450
4451         loop:
4452           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4453           VECT_DEF = vector_stmt                # vectorized form of STMT
4454           s_loop = scalar_stmt                  # (scalar) STMT
4455         loop_exit:
4456           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4457           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4458           v_out2 = reduce <v_out1>
4459           s_out3 = extract_field <v_out2, 0>
4460           s_out4 = adjust_result <s_out3>
4461           use <s_out4>
4462           use <s_out4>
4463 */
4464
4465 static void
4466 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4467                                   gimple *reduc_def_stmt,
4468                                   int ncopies, internal_fn reduc_fn,
4469                                   vec<gimple *> reduction_phis,
4470                                   bool double_reduc,
4471                                   slp_tree slp_node,
4472                                   slp_instance slp_node_instance,
4473                                   tree induc_val, enum tree_code induc_code,
4474                                   tree neutral_op)
4475 {
4476   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4477   stmt_vec_info prev_phi_info;
4478   tree vectype;
4479   machine_mode mode;
4480   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4481   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4482   basic_block exit_bb;
4483   tree scalar_dest;
4484   tree scalar_type;
4485   gimple *new_phi = NULL, *phi;
4486   gimple_stmt_iterator exit_gsi;
4487   tree vec_dest;
4488   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4489   gimple *epilog_stmt = NULL;
4490   enum tree_code code = gimple_assign_rhs_code (stmt);
4491   gimple *exit_phi;
4492   tree bitsize;
4493   tree adjustment_def = NULL;
4494   tree vec_initial_def = NULL;
4495   tree expr, def, initial_def = NULL;
4496   tree orig_name, scalar_result;
4497   imm_use_iterator imm_iter, phi_imm_iter;
4498   use_operand_p use_p, phi_use_p;
4499   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4500   bool nested_in_vect_loop = false;
4501   auto_vec<gimple *> new_phis;
4502   auto_vec<gimple *> inner_phis;
4503   enum vect_def_type dt = vect_unknown_def_type;
4504   int j, i;
4505   auto_vec<tree> scalar_results;
4506   unsigned int group_size = 1, k, ratio;
4507   auto_vec<tree> vec_initial_defs;
4508   auto_vec<gimple *> phis;
4509   bool slp_reduc = false;
4510   bool direct_slp_reduc;
4511   tree new_phi_result;
4512   gimple *inner_phi = NULL;
4513   tree induction_index = NULL_TREE;
4514
4515   if (slp_node)
4516     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4517
4518   if (nested_in_vect_loop_p (loop, stmt))
4519     {
4520       outer_loop = loop;
4521       loop = loop->inner;
4522       nested_in_vect_loop = true;
4523       gcc_assert (!slp_node);
4524     }
4525
4526   vectype = STMT_VINFO_VECTYPE (stmt_info);
4527   gcc_assert (vectype);
4528   mode = TYPE_MODE (vectype);
4529
4530   /* 1. Create the reduction def-use cycle:
4531      Set the arguments of REDUCTION_PHIS, i.e., transform
4532
4533         loop:
4534           vec_def = phi <null, null>            # REDUCTION_PHI
4535           VECT_DEF = vector_stmt                # vectorized form of STMT
4536           ...
4537
4538      into:
4539
4540         loop:
4541           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4542           VECT_DEF = vector_stmt                # vectorized form of STMT
4543           ...
4544
4545      (in case of SLP, do it for all the phis). */
4546
4547   /* Get the loop-entry arguments.  */
4548   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4549   if (slp_node)
4550     {
4551       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4552       vec_initial_defs.reserve (vec_num);
4553       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4554                                       &vec_initial_defs, vec_num,
4555                                       REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4556                                       neutral_op);
4557     }
4558   else
4559     {
4560       /* Get at the scalar def before the loop, that defines the initial value
4561          of the reduction variable.  */
4562       gimple *def_stmt;
4563       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4564                                            loop_preheader_edge (loop));
4565       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4566          and we can't use zero for induc_val, use initial_def.  Similarly
4567          for REDUC_MIN and initial_def larger than the base.  */
4568       if (TREE_CODE (initial_def) == INTEGER_CST
4569           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4570               == INTEGER_INDUC_COND_REDUCTION)
4571           && !integer_zerop (induc_val)
4572           && ((induc_code == MAX_EXPR
4573                && tree_int_cst_lt (initial_def, induc_val))
4574               || (induc_code == MIN_EXPR
4575                   && tree_int_cst_lt (induc_val, initial_def))))
4576         induc_val = initial_def;
4577       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4578       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4579                                                        &adjustment_def);
4580       vec_initial_defs.create (1);
4581       vec_initial_defs.quick_push (vec_initial_def);
4582     }
4583
4584   /* Set phi nodes arguments.  */
4585   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4586     {
4587       tree vec_init_def = vec_initial_defs[i];
4588       tree def = vect_defs[i];
4589       for (j = 0; j < ncopies; j++)
4590         {
4591           if (j != 0)
4592             {
4593               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4594               if (nested_in_vect_loop)
4595                 vec_init_def
4596                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4597                                                     vec_init_def);
4598             }
4599
4600           /* Set the loop-entry arg of the reduction-phi.  */
4601
4602           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4603               == INTEGER_INDUC_COND_REDUCTION)
4604             {
4605               /* Initialise the reduction phi to zero.  This prevents initial
4606                  values of non-zero interfering with the reduction op.  */
4607               gcc_assert (ncopies == 1);
4608               gcc_assert (i == 0);
4609
4610               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4611               tree induc_val_vec
4612                 = build_vector_from_val (vec_init_def_type, induc_val);
4613
4614               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4615                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4616             }
4617           else
4618             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4619                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4620
4621           /* Set the loop-latch arg for the reduction-phi.  */
4622           if (j > 0)
4623             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4624
4625           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4626                        UNKNOWN_LOCATION);
4627
4628           if (dump_enabled_p ())
4629             {
4630               dump_printf_loc (MSG_NOTE, vect_location,
4631                                "transform reduction: created def-use cycle: ");
4632               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4633               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4634             }
4635         }
4636     }
4637
4638   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4639      which is updated with the current index of the loop for every match of
4640      the original loop's cond_expr (VEC_STMT).  This results in a vector
4641      containing the last time the condition passed for that vector lane.
4642      The first match will be a 1 to allow 0 to be used for non-matching
4643      indexes.  If there are no matches at all then the vector will be all
4644      zeroes.  */
4645   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4646     {
4647       tree indx_before_incr, indx_after_incr;
4648       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4649
4650       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4651       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4652
4653       int scalar_precision
4654         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4655       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4656       tree cr_index_vector_type = build_vector_type
4657         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4658
4659       /* First we create a simple vector induction variable which starts
4660          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4661          vector size (STEP).  */
4662
4663       /* Create a {1,2,3,...} vector.  */
4664       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4665
4666       /* Create a vector of the step value.  */
4667       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4668       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4669
4670       /* Create an induction variable.  */
4671       gimple_stmt_iterator incr_gsi;
4672       bool insert_after;
4673       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4674       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4675                  insert_after, &indx_before_incr, &indx_after_incr);
4676
4677       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4678          filled with zeros (VEC_ZERO).  */
4679
4680       /* Create a vector of 0s.  */
4681       tree zero = build_zero_cst (cr_index_scalar_type);
4682       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4683
4684       /* Create a vector phi node.  */
4685       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4686       new_phi = create_phi_node (new_phi_tree, loop->header);
4687       set_vinfo_for_stmt (new_phi,
4688                           new_stmt_vec_info (new_phi, loop_vinfo));
4689       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4690                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4691
4692       /* Now take the condition from the loops original cond_expr
4693          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4694          every match uses values from the induction variable
4695          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4696          (NEW_PHI_TREE).
4697          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4698          the new cond_expr (INDEX_COND_EXPR).  */
4699
4700       /* Duplicate the condition from vec_stmt.  */
4701       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4702
4703       /* Create a conditional, where the condition is taken from vec_stmt
4704          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4705          else is the phi (NEW_PHI_TREE).  */
4706       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4707                                      ccompare, indx_before_incr,
4708                                      new_phi_tree);
4709       induction_index = make_ssa_name (cr_index_vector_type);
4710       gimple *index_condition = gimple_build_assign (induction_index,
4711                                                      index_cond_expr);
4712       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4713       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4714                                                         loop_vinfo);
4715       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4716       set_vinfo_for_stmt (index_condition, index_vec_info);
4717
4718       /* Update the phi with the vec cond.  */
4719       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4720                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4721     }
4722
4723   /* 2. Create epilog code.
4724         The reduction epilog code operates across the elements of the vector
4725         of partial results computed by the vectorized loop.
4726         The reduction epilog code consists of:
4727
4728         step 1: compute the scalar result in a vector (v_out2)
4729         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4730         step 3: adjust the scalar result (s_out3) if needed.
4731
4732         Step 1 can be accomplished using one of the following three schemes:
4733           (scheme 1) using reduc_fn, if available.
4734           (scheme 2) using whole-vector shifts, if available.
4735           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4736                      combined.
4737
4738           The overall epilog code looks like this:
4739
4740           s_out0 = phi <s_loop>         # original EXIT_PHI
4741           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4742           v_out2 = reduce <v_out1>              # step 1
4743           s_out3 = extract_field <v_out2, 0>    # step 2
4744           s_out4 = adjust_result <s_out3>       # step 3
4745
4746           (step 3 is optional, and steps 1 and 2 may be combined).
4747           Lastly, the uses of s_out0 are replaced by s_out4.  */
4748
4749
4750   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4751          v_out1 = phi <VECT_DEF>
4752          Store them in NEW_PHIS.  */
4753
4754   exit_bb = single_exit (loop)->dest;
4755   prev_phi_info = NULL;
4756   new_phis.create (vect_defs.length ());
4757   FOR_EACH_VEC_ELT (vect_defs, i, def)
4758     {
4759       for (j = 0; j < ncopies; j++)
4760         {
4761           tree new_def = copy_ssa_name (def);
4762           phi = create_phi_node (new_def, exit_bb);
4763           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4764           if (j == 0)
4765             new_phis.quick_push (phi);
4766           else
4767             {
4768               def = vect_get_vec_def_for_stmt_copy (dt, def);
4769               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4770             }
4771
4772           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4773           prev_phi_info = vinfo_for_stmt (phi);
4774         }
4775     }
4776
4777   /* The epilogue is created for the outer-loop, i.e., for the loop being
4778      vectorized.  Create exit phis for the outer loop.  */
4779   if (double_reduc)
4780     {
4781       loop = outer_loop;
4782       exit_bb = single_exit (loop)->dest;
4783       inner_phis.create (vect_defs.length ());
4784       FOR_EACH_VEC_ELT (new_phis, i, phi)
4785         {
4786           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4787           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4788           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4789                            PHI_RESULT (phi));
4790           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4791                                                             loop_vinfo));
4792           inner_phis.quick_push (phi);
4793           new_phis[i] = outer_phi;
4794           prev_phi_info = vinfo_for_stmt (outer_phi);
4795           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4796             {
4797               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4798               new_result = copy_ssa_name (PHI_RESULT (phi));
4799               outer_phi = create_phi_node (new_result, exit_bb);
4800               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4801                                PHI_RESULT (phi));
4802               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4803                                                                 loop_vinfo));
4804               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4805               prev_phi_info = vinfo_for_stmt (outer_phi);
4806             }
4807         }
4808     }
4809
4810   exit_gsi = gsi_after_labels (exit_bb);
4811
4812   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4813          (i.e. when reduc_fn is not available) and in the final adjustment
4814          code (if needed).  Also get the original scalar reduction variable as
4815          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4816          represents a reduction pattern), the tree-code and scalar-def are
4817          taken from the original stmt that the pattern-stmt (STMT) replaces.
4818          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4819          are taken from STMT.  */
4820
4821   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4822   if (!orig_stmt)
4823     {
4824       /* Regular reduction  */
4825       orig_stmt = stmt;
4826     }
4827   else
4828     {
4829       /* Reduction pattern  */
4830       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4831       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4832       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4833     }
4834
4835   code = gimple_assign_rhs_code (orig_stmt);
4836   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4837      partial results are added and not subtracted.  */
4838   if (code == MINUS_EXPR)
4839     code = PLUS_EXPR;
4840
4841   scalar_dest = gimple_assign_lhs (orig_stmt);
4842   scalar_type = TREE_TYPE (scalar_dest);
4843   scalar_results.create (group_size);
4844   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4845   bitsize = TYPE_SIZE (scalar_type);
4846
4847   /* In case this is a reduction in an inner-loop while vectorizing an outer
4848      loop - we don't need to extract a single scalar result at the end of the
4849      inner-loop (unless it is double reduction, i.e., the use of reduction is
4850      outside the outer-loop).  The final vector of partial results will be used
4851      in the vectorized outer-loop, or reduced to a scalar result at the end of
4852      the outer-loop.  */
4853   if (nested_in_vect_loop && !double_reduc)
4854     goto vect_finalize_reduction;
4855
4856   /* SLP reduction without reduction chain, e.g.,
4857      # a1 = phi <a2, a0>
4858      # b1 = phi <b2, b0>
4859      a2 = operation (a1)
4860      b2 = operation (b1)  */
4861   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4862
4863   /* True if we should implement SLP_REDUC using native reduction operations
4864      instead of scalar operations.  */
4865   direct_slp_reduc = (reduc_fn != IFN_LAST
4866                       && slp_reduc
4867                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4868
4869   /* In case of reduction chain, e.g.,
4870      # a1 = phi <a3, a0>
4871      a2 = operation (a1)
4872      a3 = operation (a2),
4873
4874      we may end up with more than one vector result.  Here we reduce them to
4875      one vector.  */
4876   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4877     {
4878       tree first_vect = PHI_RESULT (new_phis[0]);
4879       gassign *new_vec_stmt = NULL;
4880       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4881       for (k = 1; k < new_phis.length (); k++)
4882         {
4883           gimple *next_phi = new_phis[k];
4884           tree second_vect = PHI_RESULT (next_phi);
4885           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4886           new_vec_stmt = gimple_build_assign (tem, code,
4887                                               first_vect, second_vect);
4888           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4889           first_vect = tem;
4890         }
4891
4892       new_phi_result = first_vect;
4893       if (new_vec_stmt)
4894         {
4895           new_phis.truncate (0);
4896           new_phis.safe_push (new_vec_stmt);
4897         }
4898     }
4899   /* Likewise if we couldn't use a single defuse cycle.  */
4900   else if (ncopies > 1)
4901     {
4902       gcc_assert (new_phis.length () == 1);
4903       tree first_vect = PHI_RESULT (new_phis[0]);
4904       gassign *new_vec_stmt = NULL;
4905       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4906       gimple *next_phi = new_phis[0];
4907       for (int k = 1; k < ncopies; ++k)
4908         {
4909           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4910           tree second_vect = PHI_RESULT (next_phi);
4911           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4912           new_vec_stmt = gimple_build_assign (tem, code,
4913                                               first_vect, second_vect);
4914           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4915           first_vect = tem;
4916         }
4917       new_phi_result = first_vect;
4918       new_phis.truncate (0);
4919       new_phis.safe_push (new_vec_stmt);
4920     }
4921   else
4922     new_phi_result = PHI_RESULT (new_phis[0]);
4923
4924   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4925       && reduc_fn != IFN_LAST)
4926     {
4927       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4928          various data values where the condition matched and another vector
4929          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4930          need to extract the last matching index (which will be the index with
4931          highest value) and use this to index into the data vector.
4932          For the case where there were no matches, the data vector will contain
4933          all default values and the index vector will be all zeros.  */
4934
4935       /* Get various versions of the type of the vector of indexes.  */
4936       tree index_vec_type = TREE_TYPE (induction_index);
4937       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4938       tree index_scalar_type = TREE_TYPE (index_vec_type);
4939       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4940         (index_vec_type);
4941
4942       /* Get an unsigned integer version of the type of the data vector.  */
4943       int scalar_precision
4944         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4945       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4946       tree vectype_unsigned = build_vector_type
4947         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4948
4949       /* First we need to create a vector (ZERO_VEC) of zeros and another
4950          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4951          can create using a MAX reduction and then expanding.
4952          In the case where the loop never made any matches, the max index will
4953          be zero.  */
4954
4955       /* Vector of {0, 0, 0,...}.  */
4956       tree zero_vec = make_ssa_name (vectype);
4957       tree zero_vec_rhs = build_zero_cst (vectype);
4958       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4959       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4960
4961       /* Find maximum value from the vector of found indexes.  */
4962       tree max_index = make_ssa_name (index_scalar_type);
4963       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4964                                                           1, induction_index);
4965       gimple_call_set_lhs (max_index_stmt, max_index);
4966       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4967
4968       /* Vector of {max_index, max_index, max_index,...}.  */
4969       tree max_index_vec = make_ssa_name (index_vec_type);
4970       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4971                                                       max_index);
4972       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4973                                                         max_index_vec_rhs);
4974       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4975
4976       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4977          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4978          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4979          otherwise.  Only one value should match, resulting in a vector
4980          (VEC_COND) with one data value and the rest zeros.
4981          In the case where the loop never made any matches, every index will
4982          match, resulting in a vector with all data values (which will all be
4983          the default value).  */
4984
4985       /* Compare the max index vector to the vector of found indexes to find
4986          the position of the max value.  */
4987       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4988       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4989                                                       induction_index,
4990                                                       max_index_vec);
4991       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4992
4993       /* Use the compare to choose either values from the data vector or
4994          zero.  */
4995       tree vec_cond = make_ssa_name (vectype);
4996       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4997                                                    vec_compare, new_phi_result,
4998                                                    zero_vec);
4999       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5000
5001       /* Finally we need to extract the data value from the vector (VEC_COND)
5002          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
5003          reduction, but because this doesn't exist, we can use a MAX reduction
5004          instead.  The data value might be signed or a float so we need to cast
5005          it first.
5006          In the case where the loop never made any matches, the data values are
5007          all identical, and so will reduce down correctly.  */
5008
5009       /* Make the matched data values unsigned.  */
5010       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5011       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5012                                        vec_cond);
5013       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5014                                                         VIEW_CONVERT_EXPR,
5015                                                         vec_cond_cast_rhs);
5016       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5017
5018       /* Reduce down to a scalar value.  */
5019       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5020       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5021                                                            1, vec_cond_cast);
5022       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5023       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5024
5025       /* Convert the reduced value back to the result type and set as the
5026          result.  */
5027       gimple_seq stmts = NULL;
5028       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5029                                data_reduc);
5030       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5031       scalar_results.safe_push (new_temp);
5032     }
5033   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5034            && reduc_fn == IFN_LAST)
5035     {
5036       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5037          idx = 0;
5038          idx_val = induction_index[0];
5039          val = data_reduc[0];
5040          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5041            if (induction_index[i] > idx_val)
5042              val = data_reduc[i], idx_val = induction_index[i];
5043          return val;  */
5044
5045       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5046       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5047       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5048       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5049       /* Enforced by vectorizable_reduction, which ensures we have target
5050          support before allowing a conditional reduction on variable-length
5051          vectors.  */
5052       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5053       tree idx_val = NULL_TREE, val = NULL_TREE;
5054       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5055         {
5056           tree old_idx_val = idx_val;
5057           tree old_val = val;
5058           idx_val = make_ssa_name (idx_eltype);
5059           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5060                                              build3 (BIT_FIELD_REF, idx_eltype,
5061                                                      induction_index,
5062                                                      bitsize_int (el_size),
5063                                                      bitsize_int (off)));
5064           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5065           val = make_ssa_name (data_eltype);
5066           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5067                                              build3 (BIT_FIELD_REF,
5068                                                      data_eltype,
5069                                                      new_phi_result,
5070                                                      bitsize_int (el_size),
5071                                                      bitsize_int (off)));
5072           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5073           if (off != 0)
5074             {
5075               tree new_idx_val = idx_val;
5076               tree new_val = val;
5077               if (off != v_size - el_size)
5078                 {
5079                   new_idx_val = make_ssa_name (idx_eltype);
5080                   epilog_stmt = gimple_build_assign (new_idx_val,
5081                                                      MAX_EXPR, idx_val,
5082                                                      old_idx_val);
5083                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5084                 }
5085               new_val = make_ssa_name (data_eltype);
5086               epilog_stmt = gimple_build_assign (new_val,
5087                                                  COND_EXPR,
5088                                                  build2 (GT_EXPR,
5089                                                          boolean_type_node,
5090                                                          idx_val,
5091                                                          old_idx_val),
5092                                                  val, old_val);
5093               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5094               idx_val = new_idx_val;
5095               val = new_val;
5096             }
5097         }
5098       /* Convert the reduced value back to the result type and set as the
5099          result.  */
5100       gimple_seq stmts = NULL;
5101       val = gimple_convert (&stmts, scalar_type, val);
5102       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5103       scalar_results.safe_push (val);
5104     }
5105
5106   /* 2.3 Create the reduction code, using one of the three schemes described
5107          above. In SLP we simply need to extract all the elements from the
5108          vector (without reducing them), so we use scalar shifts.  */
5109   else if (reduc_fn != IFN_LAST && !slp_reduc)
5110     {
5111       tree tmp;
5112       tree vec_elem_type;
5113
5114       /* Case 1:  Create:
5115          v_out2 = reduc_expr <v_out1>  */
5116
5117       if (dump_enabled_p ())
5118         dump_printf_loc (MSG_NOTE, vect_location,
5119                          "Reduce using direct vector reduction.\n");
5120
5121       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5122       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5123         {
5124           tree tmp_dest
5125             = vect_create_destination_var (scalar_dest, vec_elem_type);
5126           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5127                                                     new_phi_result);
5128           gimple_set_lhs (epilog_stmt, tmp_dest);
5129           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5130           gimple_set_lhs (epilog_stmt, new_temp);
5131           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5132
5133           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5134                                              new_temp);
5135         }
5136       else
5137         {
5138           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5139                                                     new_phi_result);
5140           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5141         }
5142
5143       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5144       gimple_set_lhs (epilog_stmt, new_temp);
5145       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5146
5147       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5148            == INTEGER_INDUC_COND_REDUCTION)
5149           && !operand_equal_p (initial_def, induc_val, 0))
5150         {
5151           /* Earlier we set the initial value to be a vector of induc_val
5152              values.  Check the result and if it is induc_val then replace
5153              with the original initial value, unless induc_val is
5154              the same as initial_def already.  */
5155           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5156                                   induc_val);
5157
5158           tmp = make_ssa_name (new_scalar_dest);
5159           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5160                                              initial_def, new_temp);
5161           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5162           new_temp = tmp;
5163         }
5164
5165       scalar_results.safe_push (new_temp);
5166     }
5167   else if (direct_slp_reduc)
5168     {
5169       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5170          with the elements for other SLP statements replaced with the
5171          neutral value.  We can then do a normal reduction on each vector.  */
5172
5173       /* Enforced by vectorizable_reduction.  */
5174       gcc_assert (new_phis.length () == 1);
5175       gcc_assert (pow2p_hwi (group_size));
5176
5177       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5178       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5179       gimple_seq seq = NULL;
5180
5181       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5182          and the same element size as VECTYPE.  */
5183       tree index = build_index_vector (vectype, 0, 1);
5184       tree index_type = TREE_TYPE (index);
5185       tree index_elt_type = TREE_TYPE (index_type);
5186       tree mask_type = build_same_sized_truth_vector_type (index_type);
5187
5188       /* Create a vector that, for each element, identifies which of
5189          the REDUC_GROUP_SIZE results should use it.  */
5190       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5191       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5192                             build_vector_from_val (index_type, index_mask));
5193
5194       /* Get a neutral vector value.  This is simply a splat of the neutral
5195          scalar value if we have one, otherwise the initial scalar value
5196          is itself a neutral value.  */
5197       tree vector_identity = NULL_TREE;
5198       if (neutral_op)
5199         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5200                                                         neutral_op);
5201       for (unsigned int i = 0; i < group_size; ++i)
5202         {
5203           /* If there's no universal neutral value, we can use the
5204              initial scalar value from the original PHI.  This is used
5205              for MIN and MAX reduction, for example.  */
5206           if (!neutral_op)
5207             {
5208               tree scalar_value
5209                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5210                                          loop_preheader_edge (loop));
5211               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5212                                                               scalar_value);
5213             }
5214
5215           /* Calculate the equivalent of:
5216
5217              sel[j] = (index[j] == i);
5218
5219              which selects the elements of NEW_PHI_RESULT that should
5220              be included in the result.  */
5221           tree compare_val = build_int_cst (index_elt_type, i);
5222           compare_val = build_vector_from_val (index_type, compare_val);
5223           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5224                                    index, compare_val);
5225
5226           /* Calculate the equivalent of:
5227
5228              vec = sel ? new_phi_result : vector_identity;
5229
5230              VEC is now suitable for a full vector reduction.  */
5231           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5232                                    sel, new_phi_result, vector_identity);
5233
5234           /* Do the reduction and convert it to the appropriate type.  */
5235           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5236                                       TREE_TYPE (vectype), vec);
5237           scalar = gimple_convert (&seq, scalar_type, scalar);
5238           scalar_results.safe_push (scalar);
5239         }
5240       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5241     }
5242   else
5243     {
5244       bool reduce_with_shift;
5245       tree vec_temp;
5246
5247       /* COND reductions all do the final reduction with MAX_EXPR
5248          or MIN_EXPR.  */
5249       if (code == COND_EXPR)
5250         {
5251           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5252               == INTEGER_INDUC_COND_REDUCTION)
5253             code = induc_code;
5254           else
5255             code = MAX_EXPR;
5256         }
5257
5258       /* See if the target wants to do the final (shift) reduction
5259          in a vector mode of smaller size and first reduce upper/lower
5260          halves against each other.  */
5261       enum machine_mode mode1 = mode;
5262       tree vectype1 = vectype;
5263       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5264       unsigned sz1 = sz;
5265       if (!slp_reduc
5266           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5267         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5268
5269       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5270       reduce_with_shift = have_whole_vector_shift (mode1);
5271       if (!VECTOR_MODE_P (mode1))
5272         reduce_with_shift = false;
5273       else
5274         {
5275           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5276           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5277             reduce_with_shift = false;
5278         }
5279
5280       /* First reduce the vector to the desired vector size we should
5281          do shift reduction on by combining upper and lower halves.  */
5282       new_temp = new_phi_result;
5283       while (sz > sz1)
5284         {
5285           gcc_assert (!slp_reduc);
5286           sz /= 2;
5287           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5288
5289           /* The target has to make sure we support lowpart/highpart
5290              extraction, either via direct vector extract or through
5291              an integer mode punning.  */
5292           tree dst1, dst2;
5293           if (convert_optab_handler (vec_extract_optab,
5294                                      TYPE_MODE (TREE_TYPE (new_temp)),
5295                                      TYPE_MODE (vectype1))
5296               != CODE_FOR_nothing)
5297             {
5298               /* Extract sub-vectors directly once vec_extract becomes
5299                  a conversion optab.  */
5300               dst1 = make_ssa_name (vectype1);
5301               epilog_stmt
5302                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5303                                          build3 (BIT_FIELD_REF, vectype1,
5304                                                  new_temp, TYPE_SIZE (vectype1),
5305                                                  bitsize_int (0)));
5306               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5307               dst2 =  make_ssa_name (vectype1);
5308               epilog_stmt
5309                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5310                                          build3 (BIT_FIELD_REF, vectype1,
5311                                                  new_temp, TYPE_SIZE (vectype1),
5312                                                  bitsize_int (sz * BITS_PER_UNIT)));
5313               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5314             }
5315           else
5316             {
5317               /* Extract via punning to appropriately sized integer mode
5318                  vector.  */
5319               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5320                                                             1);
5321               tree etype = build_vector_type (eltype, 2);
5322               gcc_assert (convert_optab_handler (vec_extract_optab,
5323                                                  TYPE_MODE (etype),
5324                                                  TYPE_MODE (eltype))
5325                           != CODE_FOR_nothing);
5326               tree tem = make_ssa_name (etype);
5327               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5328                                                  build1 (VIEW_CONVERT_EXPR,
5329                                                          etype, new_temp));
5330               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5331               new_temp = tem;
5332               tem = make_ssa_name (eltype);
5333               epilog_stmt
5334                   = gimple_build_assign (tem, BIT_FIELD_REF,
5335                                          build3 (BIT_FIELD_REF, eltype,
5336                                                  new_temp, TYPE_SIZE (eltype),
5337                                                  bitsize_int (0)));
5338               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5339               dst1 = make_ssa_name (vectype1);
5340               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5341                                                  build1 (VIEW_CONVERT_EXPR,
5342                                                          vectype1, tem));
5343               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5344               tem = make_ssa_name (eltype);
5345               epilog_stmt
5346                   = gimple_build_assign (tem, BIT_FIELD_REF,
5347                                          build3 (BIT_FIELD_REF, eltype,
5348                                                  new_temp, TYPE_SIZE (eltype),
5349                                                  bitsize_int (sz * BITS_PER_UNIT)));
5350               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5351               dst2 =  make_ssa_name (vectype1);
5352               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5353                                                  build1 (VIEW_CONVERT_EXPR,
5354                                                          vectype1, tem));
5355               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5356             }
5357
5358           new_temp = make_ssa_name (vectype1);
5359           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5360           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5361         }
5362
5363       if (reduce_with_shift && !slp_reduc)
5364         {
5365           int element_bitsize = tree_to_uhwi (bitsize);
5366           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5367              for variable-length vectors and also requires direct target support
5368              for loop reductions.  */
5369           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5370           int nelements = vec_size_in_bits / element_bitsize;
5371           vec_perm_builder sel;
5372           vec_perm_indices indices;
5373
5374           int elt_offset;
5375
5376           tree zero_vec = build_zero_cst (vectype1);
5377           /* Case 2: Create:
5378              for (offset = nelements/2; offset >= 1; offset/=2)
5379                 {
5380                   Create:  va' = vec_shift <va, offset>
5381                   Create:  va = vop <va, va'>
5382                 }  */
5383
5384           tree rhs;
5385
5386           if (dump_enabled_p ())
5387             dump_printf_loc (MSG_NOTE, vect_location,
5388                              "Reduce using vector shifts\n");
5389
5390           mode1 = TYPE_MODE (vectype1);
5391           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5392           for (elt_offset = nelements / 2;
5393                elt_offset >= 1;
5394                elt_offset /= 2)
5395             {
5396               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5397               indices.new_vector (sel, 2, nelements);
5398               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5399               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5400                                                  new_temp, zero_vec, mask);
5401               new_name = make_ssa_name (vec_dest, epilog_stmt);
5402               gimple_assign_set_lhs (epilog_stmt, new_name);
5403               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5404
5405               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5406                                                  new_temp);
5407               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5408               gimple_assign_set_lhs (epilog_stmt, new_temp);
5409               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5410             }
5411
5412           /* 2.4  Extract the final scalar result.  Create:
5413              s_out3 = extract_field <v_out2, bitpos>  */
5414
5415           if (dump_enabled_p ())
5416             dump_printf_loc (MSG_NOTE, vect_location,
5417                              "extract scalar result\n");
5418
5419           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5420                         bitsize, bitsize_zero_node);
5421           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5422           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5423           gimple_assign_set_lhs (epilog_stmt, new_temp);
5424           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5425           scalar_results.safe_push (new_temp);
5426         }
5427       else
5428         {
5429           /* Case 3: Create:
5430              s = extract_field <v_out2, 0>
5431              for (offset = element_size;
5432                   offset < vector_size;
5433                   offset += element_size;)
5434                {
5435                  Create:  s' = extract_field <v_out2, offset>
5436                  Create:  s = op <s, s'>  // For non SLP cases
5437                }  */
5438
5439           if (dump_enabled_p ())
5440             dump_printf_loc (MSG_NOTE, vect_location,
5441                              "Reduce using scalar code.\n");
5442
5443           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5444           int element_bitsize = tree_to_uhwi (bitsize);
5445           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5446             {
5447               int bit_offset;
5448               if (gimple_code (new_phi) == GIMPLE_PHI)
5449                 vec_temp = PHI_RESULT (new_phi);
5450               else
5451                 vec_temp = gimple_assign_lhs (new_phi);
5452               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5453                                  bitsize_zero_node);
5454               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5455               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5456               gimple_assign_set_lhs (epilog_stmt, new_temp);
5457               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5458
5459               /* In SLP we don't need to apply reduction operation, so we just
5460                  collect s' values in SCALAR_RESULTS.  */
5461               if (slp_reduc)
5462                 scalar_results.safe_push (new_temp);
5463
5464               for (bit_offset = element_bitsize;
5465                    bit_offset < vec_size_in_bits;
5466                    bit_offset += element_bitsize)
5467                 {
5468                   tree bitpos = bitsize_int (bit_offset);
5469                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5470                                      bitsize, bitpos);
5471
5472                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5473                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5474                   gimple_assign_set_lhs (epilog_stmt, new_name);
5475                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5476
5477                   if (slp_reduc)
5478                     {
5479                       /* In SLP we don't need to apply reduction operation, so
5480                          we just collect s' values in SCALAR_RESULTS.  */
5481                       new_temp = new_name;
5482                       scalar_results.safe_push (new_name);
5483                     }
5484                   else
5485                     {
5486                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5487                                                          new_name, new_temp);
5488                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5489                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5490                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5491                     }
5492                 }
5493             }
5494
5495           /* The only case where we need to reduce scalar results in SLP, is
5496              unrolling.  If the size of SCALAR_RESULTS is greater than
5497              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5498              REDUC_GROUP_SIZE.  */
5499           if (slp_reduc)
5500             {
5501               tree res, first_res, new_res;
5502               gimple *new_stmt;
5503
5504               /* Reduce multiple scalar results in case of SLP unrolling.  */
5505               for (j = group_size; scalar_results.iterate (j, &res);
5506                    j++)
5507                 {
5508                   first_res = scalar_results[j % group_size];
5509                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5510                                                   first_res, res);
5511                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5512                   gimple_assign_set_lhs (new_stmt, new_res);
5513                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5514                   scalar_results[j % group_size] = new_res;
5515                 }
5516             }
5517           else
5518             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5519             scalar_results.safe_push (new_temp);
5520         }
5521
5522       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5523            == INTEGER_INDUC_COND_REDUCTION)
5524           && !operand_equal_p (initial_def, induc_val, 0))
5525         {
5526           /* Earlier we set the initial value to be a vector of induc_val
5527              values.  Check the result and if it is induc_val then replace
5528              with the original initial value, unless induc_val is
5529              the same as initial_def already.  */
5530           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5531                                   induc_val);
5532
5533           tree tmp = make_ssa_name (new_scalar_dest);
5534           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5535                                              initial_def, new_temp);
5536           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5537           scalar_results[0] = tmp;
5538         }
5539     }
5540
5541 vect_finalize_reduction:
5542
5543   if (double_reduc)
5544     loop = loop->inner;
5545
5546   /* 2.5 Adjust the final result by the initial value of the reduction
5547          variable. (When such adjustment is not needed, then
5548          'adjustment_def' is zero).  For example, if code is PLUS we create:
5549          new_temp = loop_exit_def + adjustment_def  */
5550
5551   if (adjustment_def)
5552     {
5553       gcc_assert (!slp_reduc);
5554       if (nested_in_vect_loop)
5555         {
5556           new_phi = new_phis[0];
5557           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5558           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5559           new_dest = vect_create_destination_var (scalar_dest, vectype);
5560         }
5561       else
5562         {
5563           new_temp = scalar_results[0];
5564           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5565           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5566           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5567         }
5568
5569       epilog_stmt = gimple_build_assign (new_dest, expr);
5570       new_temp = make_ssa_name (new_dest, epilog_stmt);
5571       gimple_assign_set_lhs (epilog_stmt, new_temp);
5572       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5573       if (nested_in_vect_loop)
5574         {
5575           set_vinfo_for_stmt (epilog_stmt,
5576                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5577           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5578                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5579
5580           if (!double_reduc)
5581             scalar_results.quick_push (new_temp);
5582           else
5583             scalar_results[0] = new_temp;
5584         }
5585       else
5586         scalar_results[0] = new_temp;
5587
5588       new_phis[0] = epilog_stmt;
5589     }
5590
5591   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5592           phis with new adjusted scalar results, i.e., replace use <s_out0>
5593           with use <s_out4>.
5594
5595      Transform:
5596         loop_exit:
5597           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5598           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5599           v_out2 = reduce <v_out1>
5600           s_out3 = extract_field <v_out2, 0>
5601           s_out4 = adjust_result <s_out3>
5602           use <s_out0>
5603           use <s_out0>
5604
5605      into:
5606
5607         loop_exit:
5608           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5609           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5610           v_out2 = reduce <v_out1>
5611           s_out3 = extract_field <v_out2, 0>
5612           s_out4 = adjust_result <s_out3>
5613           use <s_out4>
5614           use <s_out4> */
5615
5616
5617   /* In SLP reduction chain we reduce vector results into one vector if
5618      necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
5619      LHS of the last stmt in the reduction chain, since we are looking for
5620      the loop exit phi node.  */
5621   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5622     {
5623       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5624       /* Handle reduction patterns.  */
5625       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5626         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5627
5628       scalar_dest = gimple_assign_lhs (dest_stmt);
5629       group_size = 1;
5630     }
5631
5632   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5633      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5634      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5635      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5636      correspond to the first vector stmt, etc.
5637      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5638   if (group_size > new_phis.length ())
5639     {
5640       ratio = group_size / new_phis.length ();
5641       gcc_assert (!(group_size % new_phis.length ()));
5642     }
5643   else
5644     ratio = 1;
5645
5646   for (k = 0; k < group_size; k++)
5647     {
5648       if (k % ratio == 0)
5649         {
5650           epilog_stmt = new_phis[k / ratio];
5651           reduction_phi = reduction_phis[k / ratio];
5652           if (double_reduc)
5653             inner_phi = inner_phis[k / ratio];
5654         }
5655
5656       if (slp_reduc)
5657         {
5658           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5659
5660           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5661           /* SLP statements can't participate in patterns.  */
5662           gcc_assert (!orig_stmt);
5663           scalar_dest = gimple_assign_lhs (current_stmt);
5664         }
5665
5666       phis.create (3);
5667       /* Find the loop-closed-use at the loop exit of the original scalar
5668          result.  (The reduction result is expected to have two immediate uses -
5669          one at the latch block, and one at the loop exit).  */
5670       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5671         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5672             && !is_gimple_debug (USE_STMT (use_p)))
5673           phis.safe_push (USE_STMT (use_p));
5674
5675       /* While we expect to have found an exit_phi because of loop-closed-ssa
5676          form we can end up without one if the scalar cycle is dead.  */
5677
5678       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5679         {
5680           if (outer_loop)
5681             {
5682               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5683               gphi *vect_phi;
5684
5685               /* FORNOW. Currently not supporting the case that an inner-loop
5686                  reduction is not used in the outer-loop (but only outside the
5687                  outer-loop), unless it is double reduction.  */
5688               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5689                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5690                           || double_reduc);
5691
5692               if (double_reduc)
5693                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5694               else
5695                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5696               if (!double_reduc
5697                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5698                       != vect_double_reduction_def)
5699                 continue;
5700
5701               /* Handle double reduction:
5702
5703                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5704                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5705                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5706                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5707
5708                  At that point the regular reduction (stmt2 and stmt3) is
5709                  already vectorized, as well as the exit phi node, stmt4.
5710                  Here we vectorize the phi node of double reduction, stmt1, and
5711                  update all relevant statements.  */
5712
5713               /* Go through all the uses of s2 to find double reduction phi
5714                  node, i.e., stmt1 above.  */
5715               orig_name = PHI_RESULT (exit_phi);
5716               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5717                 {
5718                   stmt_vec_info use_stmt_vinfo;
5719                   stmt_vec_info new_phi_vinfo;
5720                   tree vect_phi_init, preheader_arg, vect_phi_res;
5721                   basic_block bb = gimple_bb (use_stmt);
5722                   gimple *use;
5723
5724                   /* Check that USE_STMT is really double reduction phi
5725                      node.  */
5726                   if (gimple_code (use_stmt) != GIMPLE_PHI
5727                       || gimple_phi_num_args (use_stmt) != 2
5728                       || bb->loop_father != outer_loop)
5729                     continue;
5730                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5731                   if (!use_stmt_vinfo
5732                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5733                           != vect_double_reduction_def)
5734                     continue;
5735
5736                   /* Create vector phi node for double reduction:
5737                      vs1 = phi <vs0, vs2>
5738                      vs1 was created previously in this function by a call to
5739                        vect_get_vec_def_for_operand and is stored in
5740                        vec_initial_def;
5741                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5742                      vs0 is created here.  */
5743
5744                   /* Create vector phi node.  */
5745                   vect_phi = create_phi_node (vec_initial_def, bb);
5746                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5747                                     loop_vec_info_for_loop (outer_loop));
5748                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5749
5750                   /* Create vs0 - initial def of the double reduction phi.  */
5751                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5752                                              loop_preheader_edge (outer_loop));
5753                   vect_phi_init = get_initial_def_for_reduction
5754                     (stmt, preheader_arg, NULL);
5755
5756                   /* Update phi node arguments with vs0 and vs2.  */
5757                   add_phi_arg (vect_phi, vect_phi_init,
5758                                loop_preheader_edge (outer_loop),
5759                                UNKNOWN_LOCATION);
5760                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5761                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5762                   if (dump_enabled_p ())
5763                     {
5764                       dump_printf_loc (MSG_NOTE, vect_location,
5765                                        "created double reduction phi node: ");
5766                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5767                     }
5768
5769                   vect_phi_res = PHI_RESULT (vect_phi);
5770
5771                   /* Replace the use, i.e., set the correct vs1 in the regular
5772                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5773                      loop is redundant.  */
5774                   use = reduction_phi;
5775                   for (j = 0; j < ncopies; j++)
5776                     {
5777                       edge pr_edge = loop_preheader_edge (loop);
5778                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5779                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5780                     }
5781                 }
5782             }
5783         }
5784
5785       phis.release ();
5786       if (nested_in_vect_loop)
5787         {
5788           if (double_reduc)
5789             loop = outer_loop;
5790           else
5791             continue;
5792         }
5793
5794       phis.create (3);
5795       /* Find the loop-closed-use at the loop exit of the original scalar
5796          result.  (The reduction result is expected to have two immediate uses,
5797          one at the latch block, and one at the loop exit).  For double
5798          reductions we are looking for exit phis of the outer loop.  */
5799       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5800         {
5801           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5802             {
5803               if (!is_gimple_debug (USE_STMT (use_p)))
5804                 phis.safe_push (USE_STMT (use_p));
5805             }
5806           else
5807             {
5808               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5809                 {
5810                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5811
5812                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5813                     {
5814                       if (!flow_bb_inside_loop_p (loop,
5815                                              gimple_bb (USE_STMT (phi_use_p)))
5816                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5817                         phis.safe_push (USE_STMT (phi_use_p));
5818                     }
5819                 }
5820             }
5821         }
5822
5823       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5824         {
5825           /* Replace the uses:  */
5826           orig_name = PHI_RESULT (exit_phi);
5827           scalar_result = scalar_results[k];
5828           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5829             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5830               SET_USE (use_p, scalar_result);
5831         }
5832
5833       phis.release ();
5834     }
5835 }
5836
5837 /* Return a vector of type VECTYPE that is equal to the vector select
5838    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5839    before GSI.  */
5840
5841 static tree
5842 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5843                      tree vec, tree identity)
5844 {
5845   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5846   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5847                                           mask, vec, identity);
5848   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5849   return cond;
5850 }
5851
5852 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5853    order, starting with LHS.  Insert the extraction statements before GSI and
5854    associate the new scalar SSA names with variable SCALAR_DEST.
5855    Return the SSA name for the result.  */
5856
5857 static tree
5858 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5859                        tree_code code, tree lhs, tree vector_rhs)
5860 {
5861   tree vectype = TREE_TYPE (vector_rhs);
5862   tree scalar_type = TREE_TYPE (vectype);
5863   tree bitsize = TYPE_SIZE (scalar_type);
5864   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5865   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5866
5867   for (unsigned HOST_WIDE_INT bit_offset = 0;
5868        bit_offset < vec_size_in_bits;
5869        bit_offset += element_bitsize)
5870     {
5871       tree bitpos = bitsize_int (bit_offset);
5872       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5873                          bitsize, bitpos);
5874
5875       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5876       rhs = make_ssa_name (scalar_dest, stmt);
5877       gimple_assign_set_lhs (stmt, rhs);
5878       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5879
5880       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5881       tree new_name = make_ssa_name (scalar_dest, stmt);
5882       gimple_assign_set_lhs (stmt, new_name);
5883       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5884       lhs = new_name;
5885     }
5886   return lhs;
5887 }
5888
5889 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
5890    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5891    statement.  CODE is the operation performed by STMT and OPS are
5892    its scalar operands.  REDUC_INDEX is the index of the operand in
5893    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5894    implements in-order reduction, or IFN_LAST if we should open-code it.
5895    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5896    that should be used to control the operation in a fully-masked loop.  */
5897
5898 static bool
5899 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5900                                gimple **vec_stmt, slp_tree slp_node,
5901                                gimple *reduc_def_stmt,
5902                                tree_code code, internal_fn reduc_fn,
5903                                tree ops[3], tree vectype_in,
5904                                int reduc_index, vec_loop_masks *masks)
5905 {
5906   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5907   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5908   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5909   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5910   gimple *new_stmt = NULL;
5911
5912   int ncopies;
5913   if (slp_node)
5914     ncopies = 1;
5915   else
5916     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5917
5918   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5919   gcc_assert (ncopies == 1);
5920   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5921   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5922   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5923               == FOLD_LEFT_REDUCTION);
5924
5925   if (slp_node)
5926     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5927                           TYPE_VECTOR_SUBPARTS (vectype_in)));
5928
5929   tree op0 = ops[1 - reduc_index];
5930
5931   int group_size = 1;
5932   gimple *scalar_dest_def;
5933   auto_vec<tree> vec_oprnds0;
5934   if (slp_node)
5935     {
5936       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5937       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5938       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5939     }
5940   else
5941     {
5942       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5943       vec_oprnds0.create (1);
5944       vec_oprnds0.quick_push (loop_vec_def0);
5945       scalar_dest_def = stmt;
5946     }
5947
5948   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5949   tree scalar_type = TREE_TYPE (scalar_dest);
5950   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5951
5952   int vec_num = vec_oprnds0.length ();
5953   gcc_assert (vec_num == 1 || slp_node);
5954   tree vec_elem_type = TREE_TYPE (vectype_out);
5955   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5956
5957   tree vector_identity = NULL_TREE;
5958   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5959     vector_identity = build_zero_cst (vectype_out);
5960
5961   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5962   int i;
5963   tree def0;
5964   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5965     {
5966       tree mask = NULL_TREE;
5967       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5968         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5969
5970       /* Handle MINUS by adding the negative.  */
5971       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5972         {
5973           tree negated = make_ssa_name (vectype_out);
5974           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5975           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5976           def0 = negated;
5977         }
5978
5979       if (mask)
5980         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5981                                     vector_identity);
5982
5983       /* On the first iteration the input is simply the scalar phi
5984          result, and for subsequent iterations it is the output of
5985          the preceding operation.  */
5986       if (reduc_fn != IFN_LAST)
5987         {
5988           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5989           /* For chained SLP reductions the output of the previous reduction
5990              operation serves as the input of the next. For the final statement
5991              the output cannot be a temporary - we reuse the original
5992              scalar destination of the last statement.  */
5993           if (i != vec_num - 1)
5994             {
5995               gimple_set_lhs (new_stmt, scalar_dest_var);
5996               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5997               gimple_set_lhs (new_stmt, reduc_var);
5998             }
5999         }
6000       else
6001         {
6002           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6003                                              reduc_var, def0);
6004           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6005           /* Remove the statement, so that we can use the same code paths
6006              as for statements that we've just created.  */
6007           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6008           gsi_remove (&tmp_gsi, false);
6009         }
6010
6011       if (i == vec_num - 1)
6012         {
6013           gimple_set_lhs (new_stmt, scalar_dest);
6014           vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6015         }
6016       else
6017         vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6018
6019       if (slp_node)
6020         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6021     }
6022
6023   if (!slp_node)
6024     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6025
6026   return true;
6027 }
6028
6029 /* Function is_nonwrapping_integer_induction.
6030
6031    Check if STMT (which is part of loop LOOP) both increments and
6032    does not cause overflow.  */
6033
6034 static bool
6035 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6036 {
6037   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6038   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6039   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6040   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6041   widest_int ni, max_loop_value, lhs_max;
6042   bool overflow = false;
6043
6044   /* Make sure the loop is integer based.  */
6045   if (TREE_CODE (base) != INTEGER_CST
6046       || TREE_CODE (step) != INTEGER_CST)
6047     return false;
6048
6049   /* Check that the max size of the loop will not wrap.  */
6050
6051   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6052     return true;
6053
6054   if (! max_stmt_executions (loop, &ni))
6055     return false;
6056
6057   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6058                             &overflow);
6059   if (overflow)
6060     return false;
6061
6062   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6063                             TYPE_SIGN (lhs_type), &overflow);
6064   if (overflow)
6065     return false;
6066
6067   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6068           <= TYPE_PRECISION (lhs_type));
6069 }
6070
6071 /* Function vectorizable_reduction.
6072
6073    Check if STMT performs a reduction operation that can be vectorized.
6074    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6075    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6076    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6077
6078    This function also handles reduction idioms (patterns) that have been
6079    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6080    of this form:
6081      X = pattern_expr (arg0, arg1, ..., X)
6082    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6083    sequence that had been detected and replaced by the pattern-stmt (STMT).
6084
6085    This function also handles reduction of condition expressions, for example:
6086      for (int i = 0; i < N; i++)
6087        if (a[i] < value)
6088          last = a[i];
6089    This is handled by vectorising the loop and creating an additional vector
6090    containing the loop indexes for which "a[i] < value" was true.  In the
6091    function epilogue this is reduced to a single max value and then used to
6092    index into the vector of results.
6093
6094    In some cases of reduction patterns, the type of the reduction variable X is
6095    different than the type of the other arguments of STMT.
6096    In such cases, the vectype that is used when transforming STMT into a vector
6097    stmt is different than the vectype that is used to determine the
6098    vectorization factor, because it consists of a different number of elements
6099    than the actual number of elements that are being operated upon in parallel.
6100
6101    For example, consider an accumulation of shorts into an int accumulator.
6102    On some targets it's possible to vectorize this pattern operating on 8
6103    shorts at a time (hence, the vectype for purposes of determining the
6104    vectorization factor should be V8HI); on the other hand, the vectype that
6105    is used to create the vector form is actually V4SI (the type of the result).
6106
6107    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6108    indicates what is the actual level of parallelism (V8HI in the example), so
6109    that the right vectorization factor would be derived.  This vectype
6110    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6111    be used to create the vectorized stmt.  The right vectype for the vectorized
6112    stmt is obtained from the type of the result X:
6113         get_vectype_for_scalar_type (TREE_TYPE (X))
6114
6115    This means that, contrary to "regular" reductions (or "regular" stmts in
6116    general), the following equation:
6117       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6118    does *NOT* necessarily hold for reduction patterns.  */
6119
6120 bool
6121 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6122                         gimple **vec_stmt, slp_tree slp_node,
6123                         slp_instance slp_node_instance,
6124                         stmt_vector_for_cost *cost_vec)
6125 {
6126   tree vec_dest;
6127   tree scalar_dest;
6128   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6129   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6130   tree vectype_in = NULL_TREE;
6131   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6132   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6133   enum tree_code code, orig_code;
6134   internal_fn reduc_fn;
6135   machine_mode vec_mode;
6136   int op_type;
6137   optab optab;
6138   tree new_temp = NULL_TREE;
6139   gimple *def_stmt;
6140   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6141   gimple *cond_reduc_def_stmt = NULL;
6142   enum tree_code cond_reduc_op_code = ERROR_MARK;
6143   tree scalar_type;
6144   bool is_simple_use;
6145   gimple *orig_stmt;
6146   stmt_vec_info orig_stmt_info = NULL;
6147   int i;
6148   int ncopies;
6149   int epilog_copies;
6150   stmt_vec_info prev_stmt_info, prev_phi_info;
6151   bool single_defuse_cycle = false;
6152   gimple *new_stmt = NULL;
6153   int j;
6154   tree ops[3];
6155   enum vect_def_type dts[3];
6156   bool nested_cycle = false, found_nested_cycle_def = false;
6157   bool double_reduc = false;
6158   basic_block def_bb;
6159   struct loop * def_stmt_loop, *outer_loop = NULL;
6160   tree def_arg;
6161   gimple *def_arg_stmt;
6162   auto_vec<tree> vec_oprnds0;
6163   auto_vec<tree> vec_oprnds1;
6164   auto_vec<tree> vec_oprnds2;
6165   auto_vec<tree> vect_defs;
6166   auto_vec<gimple *> phis;
6167   int vec_num;
6168   tree def0, tem;
6169   bool first_p = true;
6170   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6171   tree cond_reduc_val = NULL_TREE;
6172
6173   /* Make sure it was already recognized as a reduction computation.  */
6174   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6175       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6176     return false;
6177
6178   if (nested_in_vect_loop_p (loop, stmt))
6179     {
6180       outer_loop = loop;
6181       loop = loop->inner;
6182       nested_cycle = true;
6183     }
6184
6185   /* In case of reduction chain we switch to the first stmt in the chain, but
6186      we don't update STMT_INFO, since only the last stmt is marked as reduction
6187      and has reduction properties.  */
6188   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6189       && REDUC_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6190     {
6191       stmt = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6192       first_p = false;
6193     }
6194
6195   if (gimple_code (stmt) == GIMPLE_PHI)
6196     {
6197       /* Analysis is fully done on the reduction stmt invocation.  */
6198       if (! vec_stmt)
6199         {
6200           if (slp_node)
6201             slp_node_instance->reduc_phis = slp_node;
6202
6203           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6204           return true;
6205         }
6206
6207       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6208         /* Leave the scalar phi in place.  Note that checking
6209            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6210            for reductions involving a single statement.  */
6211         return true;
6212
6213       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6214       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6215         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6216
6217       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6218           == EXTRACT_LAST_REDUCTION)
6219         /* Leave the scalar phi in place.  */
6220         return true;
6221
6222       gcc_assert (is_gimple_assign (reduc_stmt));
6223       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6224         {
6225           tree op = gimple_op (reduc_stmt, k);
6226           if (op == gimple_phi_result (stmt))
6227             continue;
6228           if (k == 1
6229               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6230             continue;
6231           if (!vectype_in
6232               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6233                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6234             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6235           break;
6236         }
6237       gcc_assert (vectype_in);
6238
6239       if (slp_node)
6240         ncopies = 1;
6241       else
6242         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6243
6244       use_operand_p use_p;
6245       gimple *use_stmt;
6246       if (ncopies > 1
6247           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6248               <= vect_used_only_live)
6249           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6250           && (use_stmt == reduc_stmt
6251               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6252                   == reduc_stmt)))
6253         single_defuse_cycle = true;
6254
6255       /* Create the destination vector  */
6256       scalar_dest = gimple_assign_lhs (reduc_stmt);
6257       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6258
6259       if (slp_node)
6260         /* The size vect_schedule_slp_instance computes is off for us.  */
6261         vec_num = vect_get_num_vectors
6262           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6263            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6264            vectype_in);
6265       else
6266         vec_num = 1;
6267
6268       /* Generate the reduction PHIs upfront.  */
6269       prev_phi_info = NULL;
6270       for (j = 0; j < ncopies; j++)
6271         {
6272           if (j == 0 || !single_defuse_cycle)
6273             {
6274               for (i = 0; i < vec_num; i++)
6275                 {
6276                   /* Create the reduction-phi that defines the reduction
6277                      operand.  */
6278                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6279                   set_vinfo_for_stmt (new_phi,
6280                                       new_stmt_vec_info (new_phi, loop_vinfo));
6281
6282                   if (slp_node)
6283                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6284                   else
6285                     {
6286                       if (j == 0)
6287                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6288                       else
6289                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6290                       prev_phi_info = vinfo_for_stmt (new_phi);
6291                     }
6292                 }
6293             }
6294         }
6295
6296       return true;
6297     }
6298
6299   /* 1. Is vectorizable reduction?  */
6300   /* Not supportable if the reduction variable is used in the loop, unless
6301      it's a reduction chain.  */
6302   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6303       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6304     return false;
6305
6306   /* Reductions that are not used even in an enclosing outer-loop,
6307      are expected to be "live" (used out of the loop).  */
6308   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6309       && !STMT_VINFO_LIVE_P (stmt_info))
6310     return false;
6311
6312   /* 2. Has this been recognized as a reduction pattern?
6313
6314      Check if STMT represents a pattern that has been recognized
6315      in earlier analysis stages.  For stmts that represent a pattern,
6316      the STMT_VINFO_RELATED_STMT field records the last stmt in
6317      the original sequence that constitutes the pattern.  */
6318
6319   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6320   if (orig_stmt)
6321     {
6322       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6323       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6324       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6325     }
6326
6327   /* 3. Check the operands of the operation.  The first operands are defined
6328         inside the loop body. The last operand is the reduction variable,
6329         which is defined by the loop-header-phi.  */
6330
6331   gcc_assert (is_gimple_assign (stmt));
6332
6333   /* Flatten RHS.  */
6334   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6335     {
6336     case GIMPLE_BINARY_RHS:
6337       code = gimple_assign_rhs_code (stmt);
6338       op_type = TREE_CODE_LENGTH (code);
6339       gcc_assert (op_type == binary_op);
6340       ops[0] = gimple_assign_rhs1 (stmt);
6341       ops[1] = gimple_assign_rhs2 (stmt);
6342       break;
6343
6344     case GIMPLE_TERNARY_RHS:
6345       code = gimple_assign_rhs_code (stmt);
6346       op_type = TREE_CODE_LENGTH (code);
6347       gcc_assert (op_type == ternary_op);
6348       ops[0] = gimple_assign_rhs1 (stmt);
6349       ops[1] = gimple_assign_rhs2 (stmt);
6350       ops[2] = gimple_assign_rhs3 (stmt);
6351       break;
6352
6353     case GIMPLE_UNARY_RHS:
6354       return false;
6355
6356     default:
6357       gcc_unreachable ();
6358     }
6359
6360   if (code == COND_EXPR && slp_node)
6361     return false;
6362
6363   scalar_dest = gimple_assign_lhs (stmt);
6364   scalar_type = TREE_TYPE (scalar_dest);
6365   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6366       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6367     return false;
6368
6369   /* Do not try to vectorize bit-precision reductions.  */
6370   if (!type_has_mode_precision_p (scalar_type))
6371     return false;
6372
6373   /* All uses but the last are expected to be defined in the loop.
6374      The last use is the reduction variable.  In case of nested cycle this
6375      assumption is not true: we use reduc_index to record the index of the
6376      reduction variable.  */
6377   gimple *reduc_def_stmt = NULL;
6378   int reduc_index = -1;
6379   for (i = 0; i < op_type; i++)
6380     {
6381       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6382       if (i == 0 && code == COND_EXPR)
6383         continue;
6384
6385       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6386                                           &def_stmt, &dts[i], &tem);
6387       dt = dts[i];
6388       gcc_assert (is_simple_use);
6389       if (dt == vect_reduction_def)
6390         {
6391           reduc_def_stmt = def_stmt;
6392           reduc_index = i;
6393           continue;
6394         }
6395       else if (tem)
6396         {
6397           /* To properly compute ncopies we are interested in the widest
6398              input type in case we're looking at a widening accumulation.  */
6399           if (!vectype_in
6400               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6401                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6402             vectype_in = tem;
6403         }
6404
6405       if (dt != vect_internal_def
6406           && dt != vect_external_def
6407           && dt != vect_constant_def
6408           && dt != vect_induction_def
6409           && !(dt == vect_nested_cycle && nested_cycle))
6410         return false;
6411
6412       if (dt == vect_nested_cycle)
6413         {
6414           found_nested_cycle_def = true;
6415           reduc_def_stmt = def_stmt;
6416           reduc_index = i;
6417         }
6418
6419       if (i == 1 && code == COND_EXPR)
6420         {
6421           /* Record how value of COND_EXPR is defined.  */
6422           if (dt == vect_constant_def)
6423             {
6424               cond_reduc_dt = dt;
6425               cond_reduc_val = ops[i];
6426             }
6427           if (dt == vect_induction_def
6428               && def_stmt != NULL
6429               && is_nonwrapping_integer_induction (def_stmt, loop))
6430             {
6431               cond_reduc_dt = dt;
6432               cond_reduc_def_stmt = def_stmt;
6433             }
6434         }
6435     }
6436
6437   if (!vectype_in)
6438     vectype_in = vectype_out;
6439
6440   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6441      directy used in stmt.  */
6442   if (reduc_index == -1)
6443     {
6444       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6445         {
6446           if (dump_enabled_p ())
6447             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6448                              "in-order reduction chain without SLP.\n");
6449           return false;
6450         }
6451
6452       if (orig_stmt)
6453         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6454       else
6455         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6456     }
6457
6458   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6459     return false;
6460
6461   if (!(reduc_index == -1
6462         || dts[reduc_index] == vect_reduction_def
6463         || dts[reduc_index] == vect_nested_cycle
6464         || ((dts[reduc_index] == vect_internal_def
6465              || dts[reduc_index] == vect_external_def
6466              || dts[reduc_index] == vect_constant_def
6467              || dts[reduc_index] == vect_induction_def)
6468             && nested_cycle && found_nested_cycle_def)))
6469     {
6470       /* For pattern recognized stmts, orig_stmt might be a reduction,
6471          but some helper statements for the pattern might not, or
6472          might be COND_EXPRs with reduction uses in the condition.  */
6473       gcc_assert (orig_stmt);
6474       return false;
6475     }
6476
6477   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6478   enum vect_reduction_type v_reduc_type
6479     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6480   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6481
6482   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6483   /* If we have a condition reduction, see if we can simplify it further.  */
6484   if (v_reduc_type == COND_REDUCTION)
6485     {
6486       /* TODO: We can't yet handle reduction chains, since we need to treat
6487          each COND_EXPR in the chain specially, not just the last one.
6488          E.g. for:
6489
6490             x_1 = PHI <x_3, ...>
6491             x_2 = a_2 ? ... : x_1;
6492             x_3 = a_3 ? ... : x_2;
6493
6494          we're interested in the last element in x_3 for which a_2 || a_3
6495          is true, whereas the current reduction chain handling would
6496          vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6497          as a reduction operation.  */
6498       if (reduc_index == -1)
6499         {
6500           if (dump_enabled_p ())
6501             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6502                              "conditional reduction chains not supported\n");
6503           return false;
6504         }
6505
6506       /* vect_is_simple_reduction ensured that operand 2 is the
6507          loop-carried operand.  */
6508       gcc_assert (reduc_index == 2);
6509
6510       /* Loop peeling modifies initial value of reduction PHI, which
6511          makes the reduction stmt to be transformed different to the
6512          original stmt analyzed.  We need to record reduction code for
6513          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6514          it can be used directly at transform stage.  */
6515       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6516           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6517         {
6518           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6519           gcc_assert (cond_reduc_dt == vect_constant_def);
6520           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6521         }
6522       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6523                                                vectype_in, OPTIMIZE_FOR_SPEED))
6524         {
6525           if (dump_enabled_p ())
6526             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6527                              "optimizing condition reduction with"
6528                              " FOLD_EXTRACT_LAST.\n");
6529           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6530         }
6531       else if (cond_reduc_dt == vect_induction_def)
6532         {
6533           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6534           tree base
6535             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6536           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6537
6538           gcc_assert (TREE_CODE (base) == INTEGER_CST
6539                       && TREE_CODE (step) == INTEGER_CST);
6540           cond_reduc_val = NULL_TREE;
6541           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6542              above base; punt if base is the minimum value of the type for
6543              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6544           if (tree_int_cst_sgn (step) == -1)
6545             {
6546               cond_reduc_op_code = MIN_EXPR;
6547               if (tree_int_cst_sgn (base) == -1)
6548                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6549               else if (tree_int_cst_lt (base,
6550                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6551                 cond_reduc_val
6552                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6553             }
6554           else
6555             {
6556               cond_reduc_op_code = MAX_EXPR;
6557               if (tree_int_cst_sgn (base) == 1)
6558                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6559               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6560                                         base))
6561                 cond_reduc_val
6562                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6563             }
6564           if (cond_reduc_val)
6565             {
6566               if (dump_enabled_p ())
6567                 dump_printf_loc (MSG_NOTE, vect_location,
6568                                  "condition expression based on "
6569                                  "integer induction.\n");
6570               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6571                 = INTEGER_INDUC_COND_REDUCTION;
6572             }
6573         }
6574       else if (cond_reduc_dt == vect_constant_def)
6575         {
6576           enum vect_def_type cond_initial_dt;
6577           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6578           tree cond_initial_val
6579             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6580
6581           gcc_assert (cond_reduc_val != NULL_TREE);
6582           vect_is_simple_use (cond_initial_val, loop_vinfo,
6583                               &def_stmt, &cond_initial_dt);
6584           if (cond_initial_dt == vect_constant_def
6585               && types_compatible_p (TREE_TYPE (cond_initial_val),
6586                                      TREE_TYPE (cond_reduc_val)))
6587             {
6588               tree e = fold_binary (LE_EXPR, boolean_type_node,
6589                                     cond_initial_val, cond_reduc_val);
6590               if (e && (integer_onep (e) || integer_zerop (e)))
6591                 {
6592                   if (dump_enabled_p ())
6593                     dump_printf_loc (MSG_NOTE, vect_location,
6594                                      "condition expression based on "
6595                                      "compile time constant.\n");
6596                   /* Record reduction code at analysis stage.  */
6597                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6598                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6599                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6600                     = CONST_COND_REDUCTION;
6601                 }
6602             }
6603         }
6604     }
6605
6606   if (orig_stmt)
6607     gcc_assert (tmp == orig_stmt
6608                 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6609                     == orig_stmt));
6610   else
6611     /* We changed STMT to be the first stmt in reduction chain, hence we
6612        check that in this case the first element in the chain is STMT.  */
6613     gcc_assert (stmt == tmp
6614                 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6615
6616   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6617     return false;
6618
6619   if (slp_node)
6620     ncopies = 1;
6621   else
6622     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6623
6624   gcc_assert (ncopies >= 1);
6625
6626   vec_mode = TYPE_MODE (vectype_in);
6627   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6628
6629   if (code == COND_EXPR)
6630     {
6631       /* Only call during the analysis stage, otherwise we'll lose
6632          STMT_VINFO_TYPE.  */
6633       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6634                                                 ops[reduc_index], 0, NULL,
6635                                                 cost_vec))
6636         {
6637           if (dump_enabled_p ())
6638             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6639                              "unsupported condition in reduction\n");
6640           return false;
6641         }
6642     }
6643   else
6644     {
6645       /* 4. Supportable by target?  */
6646
6647       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6648           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6649         {
6650           /* Shifts and rotates are only supported by vectorizable_shifts,
6651              not vectorizable_reduction.  */
6652           if (dump_enabled_p ())
6653             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6654                              "unsupported shift or rotation.\n");
6655           return false;
6656         }
6657
6658       /* 4.1. check support for the operation in the loop  */
6659       optab = optab_for_tree_code (code, vectype_in, optab_default);
6660       if (!optab)
6661         {
6662           if (dump_enabled_p ())
6663             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6664                              "no optab.\n");
6665
6666           return false;
6667         }
6668
6669       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6670         {
6671           if (dump_enabled_p ())
6672             dump_printf (MSG_NOTE, "op not supported by target.\n");
6673
6674           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6675               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6676             return false;
6677
6678           if (dump_enabled_p ())
6679             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6680         }
6681
6682       /* Worthwhile without SIMD support?  */
6683       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6684           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6685         {
6686           if (dump_enabled_p ())
6687             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6688                              "not worthwhile without SIMD support.\n");
6689
6690           return false;
6691         }
6692     }
6693
6694   /* 4.2. Check support for the epilog operation.
6695
6696           If STMT represents a reduction pattern, then the type of the
6697           reduction variable may be different than the type of the rest
6698           of the arguments.  For example, consider the case of accumulation
6699           of shorts into an int accumulator; The original code:
6700                         S1: int_a = (int) short_a;
6701           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6702
6703           was replaced with:
6704                         STMT: int_acc = widen_sum <short_a, int_acc>
6705
6706           This means that:
6707           1. The tree-code that is used to create the vector operation in the
6708              epilog code (that reduces the partial results) is not the
6709              tree-code of STMT, but is rather the tree-code of the original
6710              stmt from the pattern that STMT is replacing.  I.e, in the example
6711              above we want to use 'widen_sum' in the loop, but 'plus' in the
6712              epilog.
6713           2. The type (mode) we use to check available target support
6714              for the vector operation to be created in the *epilog*, is
6715              determined by the type of the reduction variable (in the example
6716              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6717              However the type (mode) we use to check available target support
6718              for the vector operation to be created *inside the loop*, is
6719              determined by the type of the other arguments to STMT (in the
6720              example we'd check this: optab_handler (widen_sum_optab,
6721              vect_short_mode)).
6722
6723           This is contrary to "regular" reductions, in which the types of all
6724           the arguments are the same as the type of the reduction variable.
6725           For "regular" reductions we can therefore use the same vector type
6726           (and also the same tree-code) when generating the epilog code and
6727           when generating the code inside the loop.  */
6728
6729   vect_reduction_type reduction_type
6730     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6731   if (orig_stmt
6732       && (reduction_type == TREE_CODE_REDUCTION
6733           || reduction_type == FOLD_LEFT_REDUCTION))
6734     {
6735       /* This is a reduction pattern: get the vectype from the type of the
6736          reduction variable, and get the tree-code from orig_stmt.  */
6737       orig_code = gimple_assign_rhs_code (orig_stmt);
6738       gcc_assert (vectype_out);
6739       vec_mode = TYPE_MODE (vectype_out);
6740     }
6741   else
6742     {
6743       /* Regular reduction: use the same vectype and tree-code as used for
6744          the vector code inside the loop can be used for the epilog code. */
6745       orig_code = code;
6746
6747       if (code == MINUS_EXPR)
6748         orig_code = PLUS_EXPR;
6749
6750       /* For simple condition reductions, replace with the actual expression
6751          we want to base our reduction around.  */
6752       if (reduction_type == CONST_COND_REDUCTION)
6753         {
6754           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6755           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6756         }
6757       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6758         orig_code = cond_reduc_op_code;
6759     }
6760
6761   if (nested_cycle)
6762     {
6763       def_bb = gimple_bb (reduc_def_stmt);
6764       def_stmt_loop = def_bb->loop_father;
6765       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6766                                        loop_preheader_edge (def_stmt_loop));
6767       if (TREE_CODE (def_arg) == SSA_NAME
6768           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6769           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6770           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6771           && vinfo_for_stmt (def_arg_stmt)
6772           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6773               == vect_double_reduction_def)
6774         double_reduc = true;
6775     }
6776
6777   reduc_fn = IFN_LAST;
6778
6779   if (reduction_type == TREE_CODE_REDUCTION
6780       || reduction_type == FOLD_LEFT_REDUCTION
6781       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6782       || reduction_type == CONST_COND_REDUCTION)
6783     {
6784       if (reduction_type == FOLD_LEFT_REDUCTION
6785           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6786           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6787         {
6788           if (reduc_fn != IFN_LAST
6789               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6790                                                   OPTIMIZE_FOR_SPEED))
6791             {
6792               if (dump_enabled_p ())
6793                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6794                                  "reduc op not supported by target.\n");
6795
6796               reduc_fn = IFN_LAST;
6797             }
6798         }
6799       else
6800         {
6801           if (!nested_cycle || double_reduc)
6802             {
6803               if (dump_enabled_p ())
6804                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6805                                  "no reduc code for scalar code.\n");
6806
6807               return false;
6808             }
6809         }
6810     }
6811   else if (reduction_type == COND_REDUCTION)
6812     {
6813       int scalar_precision
6814         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6815       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6816       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6817                                                 nunits_out);
6818
6819       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6820                                           OPTIMIZE_FOR_SPEED))
6821         reduc_fn = IFN_REDUC_MAX;
6822     }
6823
6824   if (reduction_type != EXTRACT_LAST_REDUCTION
6825       && reduc_fn == IFN_LAST
6826       && !nunits_out.is_constant ())
6827     {
6828       if (dump_enabled_p ())
6829         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6830                          "missing target support for reduction on"
6831                          " variable-length vectors.\n");
6832       return false;
6833     }
6834
6835   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6836       && ncopies > 1)
6837     {
6838       if (dump_enabled_p ())
6839         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6840                          "multiple types in double reduction or condition "
6841                          "reduction.\n");
6842       return false;
6843     }
6844
6845   /* For SLP reductions, see if there is a neutral value we can use.  */
6846   tree neutral_op = NULL_TREE;
6847   if (slp_node)
6848     neutral_op = neutral_op_for_slp_reduction
6849                    (slp_node_instance->reduc_phis, code,
6850                     REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6851
6852   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6853     {
6854       /* We can't support in-order reductions of code such as this:
6855
6856            for (int i = 0; i < n1; ++i)
6857              for (int j = 0; j < n2; ++j)
6858                l += a[j];
6859
6860          since GCC effectively transforms the loop when vectorizing:
6861
6862            for (int i = 0; i < n1 / VF; ++i)
6863              for (int j = 0; j < n2; ++j)
6864                for (int k = 0; k < VF; ++k)
6865                  l += a[j];
6866
6867          which is a reassociation of the original operation.  */
6868       if (dump_enabled_p ())
6869         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6870                          "in-order double reduction not supported.\n");
6871
6872       return false;
6873     }
6874
6875   if (reduction_type == FOLD_LEFT_REDUCTION
6876       && slp_node
6877       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6878     {
6879       /* We cannot use in-order reductions in this case because there is
6880          an implicit reassociation of the operations involved.  */
6881       if (dump_enabled_p ())
6882         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6883                          "in-order unchained SLP reductions not supported.\n");
6884       return false;
6885     }
6886
6887   /* For double reductions, and for SLP reductions with a neutral value,
6888      we construct a variable-length initial vector by loading a vector
6889      full of the neutral value and then shift-and-inserting the start
6890      values into the low-numbered elements.  */
6891   if ((double_reduc || neutral_op)
6892       && !nunits_out.is_constant ()
6893       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6894                                           vectype_out, OPTIMIZE_FOR_SPEED))
6895     {
6896       if (dump_enabled_p ())
6897         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6898                          "reduction on variable-length vectors requires"
6899                          " target support for a vector-shift-and-insert"
6900                          " operation.\n");
6901       return false;
6902     }
6903
6904   /* Check extra constraints for variable-length unchained SLP reductions.  */
6905   if (STMT_SLP_TYPE (stmt_info)
6906       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6907       && !nunits_out.is_constant ())
6908     {
6909       /* We checked above that we could build the initial vector when
6910          there's a neutral element value.  Check here for the case in
6911          which each SLP statement has its own initial value and in which
6912          that value needs to be repeated for every instance of the
6913          statement within the initial vector.  */
6914       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6915       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6916       if (!neutral_op
6917           && !can_duplicate_and_interleave_p (group_size, elt_mode))
6918         {
6919           if (dump_enabled_p ())
6920             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6921                              "unsupported form of SLP reduction for"
6922                              " variable-length vectors: cannot build"
6923                              " initial vector.\n");
6924           return false;
6925         }
6926       /* The epilogue code relies on the number of elements being a multiple
6927          of the group size.  The duplicate-and-interleave approach to setting
6928          up the initial vector does too.  */
6929       if (!multiple_p (nunits_out, group_size))
6930         {
6931           if (dump_enabled_p ())
6932             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6933                              "unsupported form of SLP reduction for"
6934                              " variable-length vectors: the vector size"
6935                              " is not a multiple of the number of results.\n");
6936           return false;
6937         }
6938     }
6939
6940   /* In case of widening multiplication by a constant, we update the type
6941      of the constant to be the type of the other operand.  We check that the
6942      constant fits the type in the pattern recognition pass.  */
6943   if (code == DOT_PROD_EXPR
6944       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6945     {
6946       if (TREE_CODE (ops[0]) == INTEGER_CST)
6947         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6948       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6949         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6950       else
6951         {
6952           if (dump_enabled_p ())
6953             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6954                              "invalid types in dot-prod\n");
6955
6956           return false;
6957         }
6958     }
6959
6960   if (reduction_type == COND_REDUCTION)
6961     {
6962       widest_int ni;
6963
6964       if (! max_loop_iterations (loop, &ni))
6965         {
6966           if (dump_enabled_p ())
6967             dump_printf_loc (MSG_NOTE, vect_location,
6968                              "loop count not known, cannot create cond "
6969                              "reduction.\n");
6970           return false;
6971         }
6972       /* Convert backedges to iterations.  */
6973       ni += 1;
6974
6975       /* The additional index will be the same type as the condition.  Check
6976          that the loop can fit into this less one (because we'll use up the
6977          zero slot for when there are no matches).  */
6978       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6979       if (wi::geu_p (ni, wi::to_widest (max_index)))
6980         {
6981           if (dump_enabled_p ())
6982             dump_printf_loc (MSG_NOTE, vect_location,
6983                              "loop size is greater than data size.\n");
6984           return false;
6985         }
6986     }
6987
6988   /* In case the vectorization factor (VF) is bigger than the number
6989      of elements that we can fit in a vectype (nunits), we have to generate
6990      more than one vector stmt - i.e - we need to "unroll" the
6991      vector stmt by a factor VF/nunits.  For more details see documentation
6992      in vectorizable_operation.  */
6993
6994   /* If the reduction is used in an outer loop we need to generate
6995      VF intermediate results, like so (e.g. for ncopies=2):
6996         r0 = phi (init, r0)
6997         r1 = phi (init, r1)
6998         r0 = x0 + r0;
6999         r1 = x1 + r1;
7000     (i.e. we generate VF results in 2 registers).
7001     In this case we have a separate def-use cycle for each copy, and therefore
7002     for each copy we get the vector def for the reduction variable from the
7003     respective phi node created for this copy.
7004
7005     Otherwise (the reduction is unused in the loop nest), we can combine
7006     together intermediate results, like so (e.g. for ncopies=2):
7007         r = phi (init, r)
7008         r = x0 + r;
7009         r = x1 + r;
7010    (i.e. we generate VF/2 results in a single register).
7011    In this case for each copy we get the vector def for the reduction variable
7012    from the vectorized reduction operation generated in the previous iteration.
7013
7014    This only works when we see both the reduction PHI and its only consumer
7015    in vectorizable_reduction and there are no intermediate stmts
7016    participating.  */
7017   use_operand_p use_p;
7018   gimple *use_stmt;
7019   if (ncopies > 1
7020       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7021       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7022       && (use_stmt == stmt
7023           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7024     {
7025       single_defuse_cycle = true;
7026       epilog_copies = 1;
7027     }
7028   else
7029     epilog_copies = ncopies;
7030
7031   /* If the reduction stmt is one of the patterns that have lane
7032      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7033   if ((ncopies > 1
7034        && ! single_defuse_cycle)
7035       && (code == DOT_PROD_EXPR
7036           || code == WIDEN_SUM_EXPR
7037           || code == SAD_EXPR))
7038     {
7039       if (dump_enabled_p ())
7040         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7041                          "multi def-use cycle not possible for lane-reducing "
7042                          "reduction operation\n");
7043       return false;
7044     }
7045
7046   if (slp_node)
7047     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7048   else
7049     vec_num = 1;
7050
7051   internal_fn cond_fn = get_conditional_internal_fn (code);
7052   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7053
7054   if (!vec_stmt) /* transformation not required.  */
7055     {
7056       if (first_p)
7057         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7058       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7059         {
7060           if (reduction_type != FOLD_LEFT_REDUCTION
7061               && (cond_fn == IFN_LAST
7062                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7063                                                       OPTIMIZE_FOR_SPEED)))
7064             {
7065               if (dump_enabled_p ())
7066                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7067                                  "can't use a fully-masked loop because no"
7068                                  " conditional operation is available.\n");
7069               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7070             }
7071           else if (reduc_index == -1)
7072             {
7073               if (dump_enabled_p ())
7074                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7075                                  "can't use a fully-masked loop for chained"
7076                                  " reductions.\n");
7077               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7078             }
7079           else
7080             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7081                                    vectype_in);
7082         }
7083       if (dump_enabled_p ()
7084           && reduction_type == FOLD_LEFT_REDUCTION)
7085         dump_printf_loc (MSG_NOTE, vect_location,
7086                          "using an in-order (fold-left) reduction.\n");
7087       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7088       return true;
7089     }
7090
7091   /* Transform.  */
7092
7093   if (dump_enabled_p ())
7094     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7095
7096   /* FORNOW: Multiple types are not supported for condition.  */
7097   if (code == COND_EXPR)
7098     gcc_assert (ncopies == 1);
7099
7100   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7101
7102   if (reduction_type == FOLD_LEFT_REDUCTION)
7103     return vectorize_fold_left_reduction
7104       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7105        reduc_fn, ops, vectype_in, reduc_index, masks);
7106
7107   if (reduction_type == EXTRACT_LAST_REDUCTION)
7108     {
7109       gcc_assert (!slp_node);
7110       return vectorizable_condition (stmt, gsi, vec_stmt,
7111                                      NULL, reduc_index, NULL, NULL);
7112     }
7113
7114   /* Create the destination vector  */
7115   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7116
7117   prev_stmt_info = NULL;
7118   prev_phi_info = NULL;
7119   if (!slp_node)
7120     {
7121       vec_oprnds0.create (1);
7122       vec_oprnds1.create (1);
7123       if (op_type == ternary_op)
7124         vec_oprnds2.create (1);
7125     }
7126
7127   phis.create (vec_num);
7128   vect_defs.create (vec_num);
7129   if (!slp_node)
7130     vect_defs.quick_push (NULL_TREE);
7131
7132   if (slp_node)
7133     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7134   else
7135     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7136
7137   for (j = 0; j < ncopies; j++)
7138     {
7139       if (code == COND_EXPR)
7140         {
7141           gcc_assert (!slp_node);
7142           vectorizable_condition (stmt, gsi, vec_stmt,
7143                                   PHI_RESULT (phis[0]),
7144                                   reduc_index, NULL, NULL);
7145           /* Multiple types are not supported for condition.  */
7146           break;
7147         }
7148
7149       /* Handle uses.  */
7150       if (j == 0)
7151         {
7152           if (slp_node)
7153             {
7154               /* Get vec defs for all the operands except the reduction index,
7155                  ensuring the ordering of the ops in the vector is kept.  */
7156               auto_vec<tree, 3> slp_ops;
7157               auto_vec<vec<tree>, 3> vec_defs;
7158
7159               slp_ops.quick_push (ops[0]);
7160               slp_ops.quick_push (ops[1]);
7161               if (op_type == ternary_op)
7162                 slp_ops.quick_push (ops[2]);
7163
7164               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7165
7166               vec_oprnds0.safe_splice (vec_defs[0]);
7167               vec_defs[0].release ();
7168               vec_oprnds1.safe_splice (vec_defs[1]);
7169               vec_defs[1].release ();
7170               if (op_type == ternary_op)
7171                 {
7172                   vec_oprnds2.safe_splice (vec_defs[2]);
7173                   vec_defs[2].release ();
7174                 }
7175             }
7176           else
7177             {
7178               vec_oprnds0.quick_push
7179                 (vect_get_vec_def_for_operand (ops[0], stmt));
7180               vec_oprnds1.quick_push
7181                 (vect_get_vec_def_for_operand (ops[1], stmt));
7182               if (op_type == ternary_op)
7183                 vec_oprnds2.quick_push
7184                   (vect_get_vec_def_for_operand (ops[2], stmt));
7185             }
7186         }
7187       else
7188         {
7189           if (!slp_node)
7190             {
7191               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7192
7193               if (single_defuse_cycle && reduc_index == 0)
7194                 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7195               else
7196                 vec_oprnds0[0]
7197                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7198               if (single_defuse_cycle && reduc_index == 1)
7199                 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7200               else
7201                 vec_oprnds1[0]
7202                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7203               if (op_type == ternary_op)
7204                 {
7205                   if (single_defuse_cycle && reduc_index == 2)
7206                     vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7207                   else
7208                     vec_oprnds2[0]
7209                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7210                 }
7211             }
7212         }
7213
7214       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7215         {
7216           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7217           if (masked_loop_p)
7218             {
7219               /* Make sure that the reduction accumulator is vop[0].  */
7220               if (reduc_index == 1)
7221                 {
7222                   gcc_assert (commutative_tree_code (code));
7223                   std::swap (vop[0], vop[1]);
7224                 }
7225               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7226                                               vectype_in, i * ncopies + j);
7227               gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7228                                                         vop[0], vop[1],
7229                                                         vop[0]);
7230               new_temp = make_ssa_name (vec_dest, call);
7231               gimple_call_set_lhs (call, new_temp);
7232               gimple_call_set_nothrow (call, true);
7233               new_stmt = call;
7234             }
7235           else
7236             {
7237               if (op_type == ternary_op)
7238                 vop[2] = vec_oprnds2[i];
7239
7240               new_temp = make_ssa_name (vec_dest, new_stmt);
7241               new_stmt = gimple_build_assign (new_temp, code,
7242                                               vop[0], vop[1], vop[2]);
7243             }
7244           vect_finish_stmt_generation (stmt, new_stmt, gsi);
7245
7246           if (slp_node)
7247             {
7248               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7249               vect_defs.quick_push (new_temp);
7250             }
7251           else
7252             vect_defs[0] = new_temp;
7253         }
7254
7255       if (slp_node)
7256         continue;
7257
7258       if (j == 0)
7259         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7260       else
7261         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7262
7263       prev_stmt_info = vinfo_for_stmt (new_stmt);
7264     }
7265
7266   /* Finalize the reduction-phi (set its arguments) and create the
7267      epilog reduction code.  */
7268   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7269     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7270
7271   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7272                                     epilog_copies, reduc_fn, phis,
7273                                     double_reduc, slp_node, slp_node_instance,
7274                                     cond_reduc_val, cond_reduc_op_code,
7275                                     neutral_op);
7276
7277   return true;
7278 }
7279
7280 /* Function vect_min_worthwhile_factor.
7281
7282    Return the smallest vectorization factor at which vectorizing the
7283    operation indicated by CODE with generic vectors is worthwhile.
7284    INT_MAX is returned for every code not listed below.  */
7285 static unsigned int
7286 vect_min_worthwhile_factor (enum tree_code code)
7287 {
7288   /* Addition, subtraction and negation: worthwhile from a factor of 4.  */
7289   if (code == PLUS_EXPR
7290       || code == MINUS_EXPR
7291       || code == NEGATE_EXPR)
7292     return 4;
7293
7294   /* Bitwise AND, IOR, XOR and NOT: worthwhile from a factor of 2.  */
7295   if (code == BIT_AND_EXPR
7296       || code == BIT_IOR_EXPR
7297       || code == BIT_XOR_EXPR
7298       || code == BIT_NOT_EXPR)
7299     return 2;
7300
7301   /* No other code has a threshold of its own; INT_MAX is handed back
7302      for all of them.  */
7303   return INT_MAX;
7304 }
7305
7306 /* Return true if VINFO describes loop vectorization and the loop's
7307    vectorization factor is a known constant that is at least the minimum
7308    worthwhile factor for decomposing CODE operations into scalar ones.  */
7309
7310 bool
7311 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7312 {
7313   unsigned HOST_WIDE_INT vf;
7314   loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
7315   /* && short-circuits, so VF is read only after is_constant succeeded.  */
7316   bool known_p = linfo && LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf);
7317   return known_p && vf >= vect_min_worthwhile_factor (code);
7318 }
7319
7320 /* Function vectorizable_induction
7321
7322    Check if PHI performs an induction computation that can be vectorized.
7323    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7324    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7325    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7326
7327 bool
7328 vectorizable_induction (gimple *phi,
7329                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7330                         gimple **vec_stmt, slp_tree slp_node,
7331                         stmt_vector_for_cost *cost_vec)
7332 {
7333   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7334   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7335   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7336   unsigned ncopies;
7337   bool nested_in_vect_loop = false;
7338   struct loop *iv_loop;
7339   tree vec_def;
7340   edge pe = loop_preheader_edge (loop);
7341   basic_block new_bb;
7342   tree new_vec, vec_init, vec_step, t;
7343   tree new_name;
7344   gimple *new_stmt;
7345   gphi *induction_phi;
7346   tree induc_def, vec_dest;
7347   tree init_expr, step_expr;
7348   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7349   unsigned i;
7350   tree expr;
7351   gimple_seq stmts;
7352   imm_use_iterator imm_iter;
7353   use_operand_p use_p;
7354   gimple *exit_phi;
7355   edge latch_e;
7356   tree loop_arg;
7357   gimple_stmt_iterator si;
7358   basic_block bb = gimple_bb (phi);
7359
7360   if (gimple_code (phi) != GIMPLE_PHI)
7361     return false;
7362
7363   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7364     return false;
7365
7366   /* Make sure it was recognized as induction computation.  */
7367   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7368     return false;
7369
7370   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7371   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7372
7373   if (slp_node)
7374     ncopies = 1;
7375   else
7376     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7377   gcc_assert (ncopies >= 1);
7378
7379   /* FORNOW. These restrictions should be relaxed.  */
7380   if (nested_in_vect_loop_p (loop, phi))
7381     {
7382       imm_use_iterator imm_iter;
7383       use_operand_p use_p;
7384       gimple *exit_phi;
7385       edge latch_e;
7386       tree loop_arg;
7387
7388       if (ncopies > 1)
7389         {
7390           if (dump_enabled_p ())
7391             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7392                              "multiple types in nested loop.\n");
7393           return false;
7394         }
7395
7396       /* FORNOW: outer loop induction with SLP not supported.  */
7397       if (STMT_SLP_TYPE (stmt_info))
7398         return false;
7399
7400       exit_phi = NULL;
7401       latch_e = loop_latch_edge (loop->inner);
7402       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7403       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7404         {
7405           gimple *use_stmt = USE_STMT (use_p);
7406           if (is_gimple_debug (use_stmt))
7407             continue;
7408
7409           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7410             {
7411               exit_phi = use_stmt;
7412               break;
7413             }
7414         }
7415       if (exit_phi)
7416         {
7417           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7418           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7419                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7420             {
7421               if (dump_enabled_p ())
7422                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7423                                  "inner-loop induction only used outside "
7424                                  "of the outer vectorized loop.\n");
7425               return false;
7426             }
7427         }
7428
7429       nested_in_vect_loop = true;
7430       iv_loop = loop->inner;
7431     }
7432   else
7433     iv_loop = loop;
7434   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7435
7436   if (slp_node && !nunits.is_constant ())
7437     {
7438       /* The current SLP code creates the initial value element-by-element.  */
7439       if (dump_enabled_p ())
7440         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7441                          "SLP induction not supported for variable-length"
7442                          " vectors.\n");
7443       return false;
7444     }
7445
7446   if (!vec_stmt) /* transformation not required.  */
7447     {
7448       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7449       DUMP_VECT_SCOPE ("vectorizable_induction");
7450       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7451       return true;
7452     }
7453
7454   /* Transform.  */
7455
7456   /* Compute a vector variable, initialized with the first VF values of
7457      the induction variable.  E.g., for an iv with IV_PHI='X' and
7458      evolution S, for a vector of 4 units, we want to compute:
7459      [X, X + S, X + 2*S, X + 3*S].  */
7460
7461   if (dump_enabled_p ())
7462     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7463
7464   latch_e = loop_latch_edge (iv_loop);
7465   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7466
7467   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7468   gcc_assert (step_expr != NULL_TREE);
7469
7470   pe = loop_preheader_edge (iv_loop);
7471   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7472                                      loop_preheader_edge (iv_loop));
7473
7474   stmts = NULL;
7475   if (!nested_in_vect_loop)
7476     {
7477       /* Convert the initial value to the desired type.  */
7478       tree new_type = TREE_TYPE (vectype);
7479       init_expr = gimple_convert (&stmts, new_type, init_expr);
7480
7481       /* If we are using the loop mask to "peel" for alignment then we need
7482          to adjust the start value here.  */
7483       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7484       if (skip_niters != NULL_TREE)
7485         {
7486           if (FLOAT_TYPE_P (vectype))
7487             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7488                                         skip_niters);
7489           else
7490             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7491           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7492                                          skip_niters, step_expr);
7493           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7494                                     init_expr, skip_step);
7495         }
7496     }
7497
7498   /* Convert the step to the desired type.  */
7499   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7500
7501   if (stmts)
7502     {
7503       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7504       gcc_assert (!new_bb);
7505     }
7506
7507   /* Find the first insertion point in the BB.  */
7508   si = gsi_after_labels (bb);
7509
7510   /* For SLP induction we have to generate several IVs as for example
7511      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7512      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7513      [VF*S, VF*S, VF*S, VF*S] for all.  */
7514   if (slp_node)
7515     {
7516       /* Enforced above.  */
7517       unsigned int const_nunits = nunits.to_constant ();
7518
7519       /* Generate [VF*S, VF*S, ... ].  */
7520       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7521         {
7522           expr = build_int_cst (integer_type_node, vf);
7523           expr = fold_convert (TREE_TYPE (step_expr), expr);
7524         }
7525       else
7526         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7527       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7528                               expr, step_expr);
7529       if (! CONSTANT_CLASS_P (new_name))
7530         new_name = vect_init_vector (phi, new_name,
7531                                      TREE_TYPE (step_expr), NULL);
7532       new_vec = build_vector_from_val (vectype, new_name);
7533       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7534
7535       /* Now generate the IVs.  */
7536       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7537       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7538       unsigned elts = const_nunits * nvects;
7539       unsigned nivs = least_common_multiple (group_size,
7540                                              const_nunits) / const_nunits;
7541       gcc_assert (elts % group_size == 0);
7542       tree elt = init_expr;
7543       unsigned ivn;
7544       for (ivn = 0; ivn < nivs; ++ivn)
7545         {
7546           tree_vector_builder elts (vectype, const_nunits, 1);
7547           stmts = NULL;
7548           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7549             {
7550               if (ivn*const_nunits + eltn >= group_size
7551                   && (ivn * const_nunits + eltn) % group_size == 0)
7552                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7553                                     elt, step_expr);
7554               elts.quick_push (elt);
7555             }
7556           vec_init = gimple_build_vector (&stmts, &elts);
7557           if (stmts)
7558             {
7559               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7560               gcc_assert (!new_bb);
7561             }
7562
7563           /* Create the induction-phi that defines the induction-operand.  */
7564           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7565           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7566           set_vinfo_for_stmt (induction_phi,
7567                               new_stmt_vec_info (induction_phi, loop_vinfo));
7568           induc_def = PHI_RESULT (induction_phi);
7569
7570           /* Create the iv update inside the loop  */
7571           vec_def = make_ssa_name (vec_dest);
7572           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7573           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7574           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7575
7576           /* Set the arguments of the phi node:  */
7577           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7578           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7579                        UNKNOWN_LOCATION);
7580
7581           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7582         }
7583
7584       /* Re-use IVs when we can.  */
7585       if (ivn < nvects)
7586         {
7587           unsigned vfp
7588             = least_common_multiple (group_size, const_nunits) / group_size;
7589           /* Generate [VF'*S, VF'*S, ... ].  */
7590           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7591             {
7592               expr = build_int_cst (integer_type_node, vfp);
7593               expr = fold_convert (TREE_TYPE (step_expr), expr);
7594             }
7595           else
7596             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7597           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7598                                   expr, step_expr);
7599           if (! CONSTANT_CLASS_P (new_name))
7600             new_name = vect_init_vector (phi, new_name,
7601                                          TREE_TYPE (step_expr), NULL);
7602           new_vec = build_vector_from_val (vectype, new_name);
7603           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7604           for (; ivn < nvects; ++ivn)
7605             {
7606               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7607               tree def;
7608               if (gimple_code (iv) == GIMPLE_PHI)
7609                 def = gimple_phi_result (iv);
7610               else
7611                 def = gimple_assign_lhs (iv);
7612               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7613                                               PLUS_EXPR,
7614                                               def, vec_step);
7615               if (gimple_code (iv) == GIMPLE_PHI)
7616                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7617               else
7618                 {
7619                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7620                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7621                 }
7622               set_vinfo_for_stmt (new_stmt,
7623                                   new_stmt_vec_info (new_stmt, loop_vinfo));
7624               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7625             }
7626         }
7627
7628       return true;
7629     }
7630
7631   /* Create the vector that holds the initial_value of the induction.  */
7632   if (nested_in_vect_loop)
7633     {
7634       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7635          been created during vectorization of previous stmts.  We obtain it
7636          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7637       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7638       /* If the initial value is not of proper type, convert it.  */
7639       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7640         {
7641           new_stmt
7642             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7643                                                           vect_simple_var,
7644                                                           "vec_iv_"),
7645                                    VIEW_CONVERT_EXPR,
7646                                    build1 (VIEW_CONVERT_EXPR, vectype,
7647                                            vec_init));
7648           vec_init = gimple_assign_lhs (new_stmt);
7649           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7650                                                  new_stmt);
7651           gcc_assert (!new_bb);
7652           set_vinfo_for_stmt (new_stmt,
7653                               new_stmt_vec_info (new_stmt, loop_vinfo));
7654         }
7655     }
7656   else
7657     {
7658       /* iv_loop is the loop to be vectorized. Create:
7659          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7660       stmts = NULL;
7661       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7662
7663       unsigned HOST_WIDE_INT const_nunits;
7664       if (nunits.is_constant (&const_nunits))
7665         {
7666           tree_vector_builder elts (vectype, const_nunits, 1);
7667           elts.quick_push (new_name);
7668           for (i = 1; i < const_nunits; i++)
7669             {
7670               /* Create: new_name_i = new_name + step_expr  */
7671               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7672                                        new_name, step_expr);
7673               elts.quick_push (new_name);
7674             }
7675           /* Create a vector from [new_name_0, new_name_1, ...,
7676              new_name_nunits-1]  */
7677           vec_init = gimple_build_vector (&stmts, &elts);
7678         }
7679       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7680         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7681         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7682                                  new_name, step_expr);
7683       else
7684         {
7685           /* Build:
7686                 [base, base, base, ...]
7687                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7688           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7689           gcc_assert (flag_associative_math);
7690           tree index = build_index_vector (vectype, 0, 1);
7691           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7692                                                         new_name);
7693           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7694                                                         step_expr);
7695           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7696           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7697                                    vec_init, step_vec);
7698           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7699                                    vec_init, base_vec);
7700         }
7701
7702       if (stmts)
7703         {
7704           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7705           gcc_assert (!new_bb);
7706         }
7707     }
7708
7709
7710   /* Create the vector that holds the step of the induction.  */
7711   if (nested_in_vect_loop)
7712     /* iv_loop is nested in the loop to be vectorized. Generate:
7713        vec_step = [S, S, S, S]  */
7714     new_name = step_expr;
7715   else
7716     {
7717       /* iv_loop is the loop to be vectorized. Generate:
7718           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7719       gimple_seq seq = NULL;
7720       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7721         {
7722           expr = build_int_cst (integer_type_node, vf);
7723           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7724         }
7725       else
7726         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7727       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7728                                expr, step_expr);
7729       if (seq)
7730         {
7731           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7732           gcc_assert (!new_bb);
7733         }
7734     }
7735
7736   t = unshare_expr (new_name);
7737   gcc_assert (CONSTANT_CLASS_P (new_name)
7738               || TREE_CODE (new_name) == SSA_NAME);
7739   new_vec = build_vector_from_val (vectype, t);
7740   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7741
7742
7743   /* Create the following def-use cycle:
7744      loop prolog:
7745          vec_init = ...
7746          vec_step = ...
7747      loop:
7748          vec_iv = PHI <vec_init, vec_loop>
7749          ...
7750          STMT
7751          ...
7752          vec_loop = vec_iv + vec_step;  */
7753
7754   /* Create the induction-phi that defines the induction-operand.  */
7755   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7756   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7757   set_vinfo_for_stmt (induction_phi,
7758                       new_stmt_vec_info (induction_phi, loop_vinfo));
7759   induc_def = PHI_RESULT (induction_phi);
7760
7761   /* Create the iv update inside the loop  */
7762   vec_def = make_ssa_name (vec_dest);
7763   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7764   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7765   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7766
7767   /* Set the arguments of the phi node:  */
7768   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7769   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7770                UNKNOWN_LOCATION);
7771
7772   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7773
7774   /* In case that vectorization factor (VF) is bigger than the number
7775      of elements that we can fit in a vectype (nunits), we have to generate
7776      more than one vector stmt - i.e - we need to "unroll" the
7777      vector stmt by a factor VF/nunits.  For more details see documentation
7778      in vectorizable_operation.  */
7779
7780   if (ncopies > 1)
7781     {
7782       gimple_seq seq = NULL;
7783       stmt_vec_info prev_stmt_vinfo;
7784       /* FORNOW. This restriction should be relaxed.  */
7785       gcc_assert (!nested_in_vect_loop);
7786
7787       /* Create the vector that holds the step of the induction.  */
7788       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7789         {
7790           expr = build_int_cst (integer_type_node, nunits);
7791           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7792         }
7793       else
7794         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7795       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7796                                expr, step_expr);
7797       if (seq)
7798         {
7799           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7800           gcc_assert (!new_bb);
7801         }
7802
7803       t = unshare_expr (new_name);
7804       gcc_assert (CONSTANT_CLASS_P (new_name)
7805                   || TREE_CODE (new_name) == SSA_NAME);
7806       new_vec = build_vector_from_val (vectype, t);
7807       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7808
7809       vec_def = induc_def;
7810       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7811       for (i = 1; i < ncopies; i++)
7812         {
7813           /* vec_i = vec_prev + vec_step  */
7814           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7815                                           vec_def, vec_step);
7816           vec_def = make_ssa_name (vec_dest, new_stmt);
7817           gimple_assign_set_lhs (new_stmt, vec_def);
7818
7819           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7820           set_vinfo_for_stmt (new_stmt,
7821                               new_stmt_vec_info (new_stmt, loop_vinfo));
7822           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7823           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7824         }
7825     }
7826
7827   if (nested_in_vect_loop)
7828     {
7829       /* Find the loop-closed exit-phi of the induction, and record
7830          the final vector of induction results:  */
7831       exit_phi = NULL;
7832       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7833         {
7834           gimple *use_stmt = USE_STMT (use_p);
7835           if (is_gimple_debug (use_stmt))
7836             continue;
7837
7838           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7839             {
7840               exit_phi = use_stmt;
7841               break;
7842             }
7843         }
7844       if (exit_phi)
7845         {
7846           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7847           /* FORNOW. Currently not supporting the case that an inner-loop induction
7848              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7849           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7850                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7851
7852           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7853           if (dump_enabled_p ())
7854             {
7855               dump_printf_loc (MSG_NOTE, vect_location,
7856                                "vector of inductions after inner-loop:");
7857               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7858             }
7859         }
7860     }
7861
7862
7863   if (dump_enabled_p ())
7864     {
7865       dump_printf_loc (MSG_NOTE, vect_location,
7866                        "transform induction: created def-use cycle: ");
7867       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7868       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7869                         SSA_NAME_DEF_STMT (vec_def), 0);
7870     }
7871
7872   return true;
7873 }
7874
7875 /* Function vectorizable_live_operation.
7876
7877    STMT computes a value that is used outside the loop.  Check if
7878    it can be supported.  */
7879
7880 bool
7881 vectorizable_live_operation (gimple *stmt,
7882                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7883                              slp_tree slp_node, int slp_index,
7884                              gimple **vec_stmt,
7885                              stmt_vector_for_cost *)
7886 {
7887   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7888   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7889   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7890   imm_use_iterator imm_iter;
7891   tree lhs, lhs_type, bitsize, vec_bitsize;
7892   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7893   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7894   int ncopies;
7895   gimple *use_stmt;
7896   auto_vec<tree> vec_oprnds;
7897   int vec_entry = 0;
7898   poly_uint64 vec_index = 0;
7899
7900   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7901
7902   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7903     return false;
7904
7905   /* FORNOW.  CHECKME.  */
7906   if (nested_in_vect_loop_p (loop, stmt))
7907     return false;
7908
7909   /* If STMT is not relevant and it is a simple assignment and its inputs are
7910      invariant then it can remain in place, unvectorized.  The original last
7911      scalar value that it computes will be used.  */
7912   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7913     {
7914       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7915       if (dump_enabled_p ())
7916         dump_printf_loc (MSG_NOTE, vect_location,
7917                          "statement is simple and uses invariant.  Leaving in "
7918                          "place.\n");
7919       return true;
7920     }
7921
7922   if (slp_node)
7923     ncopies = 1;
7924   else
7925     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7926
7927   if (slp_node)
7928     {
7929       gcc_assert (slp_index >= 0);
7930
7931       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7932       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7933
7934       /* Get the last occurrence of the scalar index from the concatenation of
7935          all the slp vectors. Calculate which slp vector it is and the index
7936          within.  */
7937       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7938
7939       /* Calculate which vector contains the result, and which lane of
7940          that vector we need.  */
7941       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7942         {
7943           if (dump_enabled_p ())
7944             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7945                              "Cannot determine which vector holds the"
7946                              " final result.\n");
7947           return false;
7948         }
7949     }
7950
7951   if (!vec_stmt)
7952     {
7953       /* No transformation required.  */
7954       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7955         {
7956           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7957                                                OPTIMIZE_FOR_SPEED))
7958             {
7959               if (dump_enabled_p ())
7960                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7961                                  "can't use a fully-masked loop because "
7962                                  "the target doesn't support extract last "
7963                                  "reduction.\n");
7964               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7965             }
7966           else if (slp_node)
7967             {
7968               if (dump_enabled_p ())
7969                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7970                                  "can't use a fully-masked loop because an "
7971                                  "SLP statement is live after the loop.\n");
7972               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7973             }
7974           else if (ncopies > 1)
7975             {
7976               if (dump_enabled_p ())
7977                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7978                                  "can't use a fully-masked loop because"
7979                                  " ncopies is greater than 1.\n");
7980               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7981             }
7982           else
7983             {
7984               gcc_assert (ncopies == 1 && !slp_node);
7985               vect_record_loop_mask (loop_vinfo,
7986                                      &LOOP_VINFO_MASKS (loop_vinfo),
7987                                      1, vectype);
7988             }
7989         }
7990       return true;
7991     }
7992
7993   /* If stmt has a related stmt, then use that for getting the lhs.  */
7994   if (is_pattern_stmt_p (stmt_info))
7995     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7996
7997   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7998         : gimple_get_lhs (stmt);
7999   lhs_type = TREE_TYPE (lhs);
8000
8001   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8002              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8003              : TYPE_SIZE (TREE_TYPE (vectype)));
8004   vec_bitsize = TYPE_SIZE (vectype);
8005
8006   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8007   tree vec_lhs, bitstart;
8008   if (slp_node)
8009     {
8010       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8011
8012       /* Get the correct slp vectorized stmt.  */
8013       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8014       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8015         vec_lhs = gimple_phi_result (phi);
8016       else
8017         vec_lhs = gimple_get_lhs (vec_stmt);
8018
8019       /* Get entry to use.  */
8020       bitstart = bitsize_int (vec_index);
8021       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8022     }
8023   else
8024     {
8025       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8026       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8027       gcc_checking_assert (ncopies == 1
8028                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8029
8030       /* For multiple copies, get the last copy.  */
8031       for (int i = 1; i < ncopies; ++i)
8032         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8033                                                   vec_lhs);
8034
8035       /* Get the last lane in the vector.  */
8036       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8037     }
8038
8039   gimple_seq stmts = NULL;
8040   tree new_tree;
8041   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8042     {
8043       /* Emit:
8044
8045            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8046
8047          where VEC_LHS is the vectorized live-out result and MASK is
8048          the loop mask for the final iteration.  */
8049       gcc_assert (ncopies == 1 && !slp_node);
8050       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8051       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8052                                       1, vectype, 0);
8053       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8054                                       scalar_type, mask, vec_lhs);
8055
8056       /* Convert the extracted vector element to the required scalar type.  */
8057       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8058     }
8059   else
8060     {
8061       tree bftype = TREE_TYPE (vectype);
8062       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8063         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8064       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8065       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8066                                        &stmts, true, NULL_TREE);
8067     }
8068
8069   if (stmts)
8070     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8071
8072   /* Replace use of lhs with newly computed result.  If the use stmt is a
8073      single arg PHI, just replace all uses of PHI result.  It's necessary
8074      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8075   use_operand_p use_p;
8076   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8077     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8078         && !is_gimple_debug (use_stmt))
8079     {
8080       if (gimple_code (use_stmt) == GIMPLE_PHI
8081           && gimple_phi_num_args (use_stmt) == 1)
8082         {
8083           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8084         }
8085       else
8086         {
8087           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8088             SET_USE (use_p, new_tree);
8089         }
8090       update_stmt (use_stmt);
8091     }
8092
8093   return true;
8094 }
8095
8096 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8097
8098 static void
8099 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8100 {
8101   ssa_op_iter op_iter;
8102   imm_use_iterator imm_iter;
8103   def_operand_p def_p;
8104   gimple *ustmt;
8105
8106   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8107     {
8108       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8109         {
8110           basic_block bb;
8111
8112           if (!is_gimple_debug (ustmt))
8113             continue;
8114
8115           bb = gimple_bb (ustmt);
8116
8117           if (!flow_bb_inside_loop_p (loop, bb))
8118             {
8119               if (gimple_debug_bind_p (ustmt))
8120                 {
8121                   if (dump_enabled_p ())
8122                     dump_printf_loc (MSG_NOTE, vect_location,
8123                                      "killing debug use\n");
8124
8125                   gimple_debug_bind_reset_value (ustmt);
8126                   update_stmt (ustmt);
8127                 }
8128               else
8129                 gcc_unreachable ();
8130             }
8131         }
8132     }
8133 }
8134
8135 /* Given loop represented by LOOP_VINFO, return true if computation of
8136    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8137    otherwise.  */
8138
8139 static bool
8140 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8141 {
8142   /* Constant case.  */
8143   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8144     {
8145       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8146       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8147
8148       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8149       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8150       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8151         return true;
8152     }
8153
8154   widest_int max;
8155   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8156   /* Check the upper bound of loop niters.  */
8157   if (get_max_loop_iterations (loop, &max))
8158     {
8159       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8160       signop sgn = TYPE_SIGN (type);
8161       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8162       if (max < type_max)
8163         return true;
8164     }
8165   return false;
8166 }
8167
8168 /* Return a mask type with half the number of elements as TYPE.  */
8169
8170 tree
8171 vect_halve_mask_nunits (tree type)
8172 {
8173   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8174   return build_truth_vector_type (nunits, current_vector_size);
8175 }
8176
8177 /* Return a mask type with twice as many elements as TYPE.  */
8178
8179 tree
8180 vect_double_mask_nunits (tree type)
8181 {
8182   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8183   return build_truth_vector_type (nunits, current_vector_size);
8184 }
8185
8186 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8187    contain a sequence of NVECTORS masks that each control a vector of type
8188    VECTYPE.  */
8189
8190 void
8191 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8192                        unsigned int nvectors, tree vectype)
8193 {
8194   gcc_assert (nvectors != 0);
8195   if (masks->length () < nvectors)
8196     masks->safe_grow_cleared (nvectors);
8197   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8198   /* The number of scalars per iteration and the number of vectors are
8199      both compile-time constants.  */
8200   unsigned int nscalars_per_iter
8201     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8202                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8203   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8204     {
8205       rgm->max_nscalars_per_iter = nscalars_per_iter;
8206       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8207     }
8208 }
8209
8210 /* Given a complete set of masks MASKS, extract mask number INDEX
8211    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8212    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8213
8214    See the comment above vec_loop_masks for more details about the mask
8215    arrangement.  */
8216
8217 tree
8218 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8219                     unsigned int nvectors, tree vectype, unsigned int index)
8220 {
8221   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8222   tree mask_type = rgm->mask_type;
8223
8224   /* Populate the rgroup's mask array, if this is the first time we've
8225      used it.  */
8226   if (rgm->masks.is_empty ())
8227     {
8228       rgm->masks.safe_grow_cleared (nvectors);
8229       for (unsigned int i = 0; i < nvectors; ++i)
8230         {
8231           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8232           /* Provide a dummy definition until the real one is available.  */
8233           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8234           rgm->masks[i] = mask;
8235         }
8236     }
8237
8238   tree mask = rgm->masks[index];
8239   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8240                 TYPE_VECTOR_SUBPARTS (vectype)))
8241     {
8242       /* A loop mask for data type X can be reused for data type Y
8243          if X has N times more elements than Y and if Y's elements
8244          are N times bigger than X's.  In this case each sequence
8245          of N elements in the loop mask will be all-zero or all-one.
8246          We can then view-convert the mask so that each sequence of
8247          N elements is replaced by a single element.  */
8248       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8249                               TYPE_VECTOR_SUBPARTS (vectype)));
8250       gimple_seq seq = NULL;
8251       mask_type = build_same_sized_truth_vector_type (vectype);
8252       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8253       if (seq)
8254         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8255     }
8256   return mask;
8257 }
8258
8259 /* Scale profiling counters by estimation for LOOP which is vectorized
8260    by factor VF.  */
8261
8262 static void
8263 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8264 {
8265   edge preheader = loop_preheader_edge (loop);
8266   /* Reduce loop iterations by the vectorization factor.  */
8267   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8268   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8269
8270   if (freq_h.nonzero_p ())
8271     {
8272       profile_probability p;
8273
8274       /* Avoid dropping loop body profile counter to 0 because of zero count
8275          in loop's preheader.  */
8276       if (!(freq_e == profile_count::zero ()))
8277         freq_e = freq_e.force_nonzero ();
8278       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8279       scale_loop_frequencies (loop, p);
8280     }
8281
8282   edge exit_e = single_exit (loop);
8283   exit_e->probability = profile_probability::always ()
8284                                  .apply_scale (1, new_est_niter + 1);
8285
8286   edge exit_l = single_pred_edge (loop->latch);
8287   profile_probability prob = exit_l->probability;
8288   exit_l->probability = exit_e->probability.invert ();
8289   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8290     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8291 }
8292
8293 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8294    When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8295    *SLP_SCHEDULED is a running record of whether we have called
8296    vect_schedule_slp.  */
8297
8298 static void
8299 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8300                           gimple_stmt_iterator *gsi,
8301                           stmt_vec_info *seen_store, bool *slp_scheduled)
8302 {
8303   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8304   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8305   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8306   if (!stmt_info)
8307     return;
8308
8309   if (dump_enabled_p ())
8310     {
8311       dump_printf_loc (MSG_NOTE, vect_location,
8312                        "------>vectorizing statement: ");
8313       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8314     }
8315
8316   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8317     vect_loop_kill_debug_uses (loop, stmt);
8318
8319   if (!STMT_VINFO_RELEVANT_P (stmt_info)
8320       && !STMT_VINFO_LIVE_P (stmt_info))
8321     return;
8322
8323   if (STMT_VINFO_VECTYPE (stmt_info))
8324     {
8325       poly_uint64 nunits
8326         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8327       if (!STMT_SLP_TYPE (stmt_info)
8328           && maybe_ne (nunits, vf)
8329           && dump_enabled_p ())
8330         /* For SLP VF is set according to unrolling factor, and not
8331            to vector size, hence for SLP this print is not valid.  */
8332         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8333     }
8334
8335   /* SLP.  Schedule all the SLP instances when the first SLP stmt is
8336      reached.  */
8337   if (STMT_SLP_TYPE (stmt_info))
8338     {
8339       if (!*slp_scheduled)
8340         {
8341           *slp_scheduled = true;
8342
8343           DUMP_VECT_SCOPE ("scheduling SLP instances");
8344
8345           vect_schedule_slp (loop_vinfo);
8346         }
8347
8348       /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8349       if (PURE_SLP_STMT (stmt_info))
8350         return;
8351     }
8352
8353   if (dump_enabled_p ())
8354     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8355
8356   bool grouped_store = false;
8357   if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8358     *seen_store = stmt_info;
8359 }
8360
8361 /* Function vect_transform_loop.
8362
8363    The analysis phase has determined that the loop is vectorizable.
8364    Vectorize the loop - create vectorized stmts to replace the scalar
8365    stmts in the loop, and update the loop exit condition.
8366    Returns scalar epilogue loop if any.  NULL is returned instead when
        LOOP_VINFO is itself an epilogue, when PARAM_VECT_EPILOGUES_NOMASK is
        zero, or when none of the target's vector sizes is accepted for the
        epilogue at the end of this function.  */
8367
8368 struct loop *
8369 vect_transform_loop (loop_vec_info loop_vinfo)
8370 {
8371   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8372   struct loop *epilogue = NULL;
8373   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8374   int nbbs = loop->num_nodes;
8375   int i;
8376   tree niters_vector = NULL_TREE;
8377   tree step_vector = NULL_TREE;
8378   tree niters_vector_mult_vf = NULL_TREE;
8379   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8380   unsigned int lowest_vf = constant_lower_bound (vf);
8381   bool slp_scheduled = false;
8382   gimple *stmt;
8383   bool check_profitability = false;
8384   unsigned int th;
8385
8386   DUMP_VECT_SCOPE ("vec_transform_loop");
8387
8388   /* Use the more conservative vectorization threshold.  If the number
8389      of iterations is constant assume the cost check has been performed
8390      by our caller.  If the threshold makes all loops profitable that
8391      run at least the (estimated) vectorization factor number of times
8392      checking is pointless, too.  */
8393   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8394   if (th >= vect_vf_for_cost (loop_vinfo)
8395       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8396     {
8397       if (dump_enabled_p ())
8398         dump_printf_loc (MSG_NOTE, vect_location,
8399                          "Profitability threshold is %d loop iterations.\n",
8400                          th);
8401       check_profitability = true;
8402     }
8403
8404   /* Make sure there exists a single-predecessor exit bb.  Do this before
8405      versioning.   */
8406   edge e = single_exit (loop);
8407   if (! single_pred_p (e->dest))
8408     {
8409       split_loop_exit_edge (e);
8410       if (dump_enabled_p ())
8411         dump_printf (MSG_NOTE, "split exit edge\n");
8412     }
8413
8414   /* Version the loop first, if required, so the profitability check
8415      comes first.  */
8416
8417   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8418     {
8419       poly_uint64 versioning_threshold
8420         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
           /* When TH and the versioning threshold can be ordered, fold the
              profitability threshold into the versioning threshold so that
              no separate profitability check is requested.  */
8421       if (check_profitability
8422           && ordered_p (poly_uint64 (th), versioning_threshold))
8423         {
8424           versioning_threshold = ordered_max (poly_uint64 (th),
8425                                               versioning_threshold);
8426           check_profitability = false;
8427         }
8428       vect_loop_versioning (loop_vinfo, th, check_profitability,
8429                             versioning_threshold);
           /* Whatever was left of the check has been passed to the
              versioning; do not request it again from peeling.  */
8430       check_profitability = false;
8431     }
8432
8433   /* Make sure there exists a single-predecessor exit bb also on the
8434      scalar loop copy.  Do this after versioning but before peeling
8435      so CFG structure is fine for both scalar and if-converted loop
8436      to make slpeel_duplicate_current_defs_from_edges face matched
8437      loop closed PHI nodes on the exit.  */
8438   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8439     {
8440       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8441       if (! single_pred_p (e->dest))
8442         {
8443           split_loop_exit_edge (e);
8444           if (dump_enabled_p ())
8445             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8446         }
8447     }
8448
       /* Build the iteration count and peel; peeling may hand back an
          epilogue loop and may already compute NITERS_VECTOR.  */
8449   tree niters = vect_build_loop_niters (loop_vinfo);
8450   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8451   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8452   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8453   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8454                               &step_vector, &niters_vector_mult_vf, th,
8455                               check_profitability, niters_no_overflow);
8456
       /* Peeling did not set NITERS_VECTOR.  With a known iteration count,
          no full masking and a compile-time constant VF it is simply the
          iteration count divided by VF with a step of one; otherwise code
          is generated to compute it.  */
8457   if (niters_vector == NULL_TREE)
8458     {
8459       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8460           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8461           && known_eq (lowest_vf, vf))
8462         {
8463           niters_vector
8464             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8465                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8466           step_vector = build_one_cst (TREE_TYPE (niters));
8467         }
8468       else
8469         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8470                                      &step_vector, niters_no_overflow);
8471     }
8472
8473   /* 1) Make sure the loop header has exactly two entries
8474      2) Make sure we have a preheader basic block.  */
8475
8476   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8477
8478   split_edge (loop_preheader_edge (loop));
8479
8480   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8481       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8482     /* This will deal with any possible peeling.  */
8483     vect_prepare_for_masked_peels (loop_vinfo);
8484
8485   /* FORNOW: the vectorizer supports only loops which body consist
8486      of one basic block (header + empty latch). When the vectorizer will
8487      support more involved loop forms, the order by which the BBs are
8488      traversed need to be reconsidered.  */
8489
8490   for (i = 0; i < nbbs; i++)
8491     {
8492       basic_block bb = bbs[i];
8493       stmt_vec_info stmt_info;
8494
           /* First pass over BB: transform the PHI nodes.  */
8495       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8496            gsi_next (&si))
8497         {
8498           gphi *phi = si.phi ();
8499           if (dump_enabled_p ())
8500             {
8501               dump_printf_loc (MSG_NOTE, vect_location,
8502                                "------>vectorizing phi: ");
8503               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8504             }
8505           stmt_info = vinfo_for_stmt (phi);
8506           if (!stmt_info)
8507             continue;
8508
8509           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8510             vect_loop_kill_debug_uses (loop, phi);
8511
8512           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8513               && !STMT_VINFO_LIVE_P (stmt_info))
8514             continue;
8515
8516           if (STMT_VINFO_VECTYPE (stmt_info)
8517               && (maybe_ne
8518                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8519               && dump_enabled_p ())
8520             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8521
               /* Only induction, reduction and nested-cycle PHIs that are
                  not pure SLP are transformed here.  */
8522           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8523                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8524                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8525               && ! PURE_SLP_STMT (stmt_info))
8526             {
8527               if (dump_enabled_p ())
8528                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8529               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8530             }
8531         }
8532
           /* Second pass over BB: transform the ordinary statements.  The
              iterator is advanced inside the body because statements may
              be removed.  */
8533       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8534            !gsi_end_p (si);)
8535         {
8536           stmt = gsi_stmt (si);
8537           /* During vectorization remove existing clobber stmts.  */
8538           if (gimple_clobber_p (stmt))
8539             {
8540               unlink_stmt_vdef (stmt);
8541               gsi_remove (&si, true);
8542               release_defs (stmt);
8543             }
8544           else
8545             {
8546               stmt_info = vinfo_for_stmt (stmt);
8547
8548               /* vector stmts created in the outer-loop during vectorization of
8549                  stmts in an inner-loop may not have a stmt_info, and do not
8550                  need to be vectorized.  */
8551               stmt_vec_info seen_store = NULL;
8552               if (stmt_info)
8553                 {
                       /* For a statement replaced by a pattern, transform
                          the pattern definition sequence and the pattern
                          statement before the original statement.  */
8554                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8555                     {
8556                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8557                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8558                            !gsi_end_p (subsi); gsi_next (&subsi))
8559                         vect_transform_loop_stmt (loop_vinfo,
8560                                                   gsi_stmt (subsi), &si,
8561                                                   &seen_store,
8562                                                   &slp_scheduled);
8563                       gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8564                       vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8565                                                 &seen_store, &slp_scheduled);
8566                     }
8567                   vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8568                                             &seen_store, &slp_scheduled);
8569                 }
                   /* SEEN_STORE is set when one of the transforms above
                      vectorized a store; the scalar store(s) are removed
                      now.  */
8570               if (seen_store)
8571                 {
8572                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8573                     {
8574                       /* Interleaving.  A set SEEN_STORE means the
8575                          vectorization of the interleaving chain was
8576                          completed - free all the stores in the chain.  */
8577                       gsi_next (&si);
8578                       vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8579                     }
8580                   else
8581                     {
8582                       /* Free the attached stmt_vec_info and remove the
8583                          stmt.  */
8584                       free_stmt_vec_info (stmt);
8585                       unlink_stmt_vdef (stmt);
8586                       gsi_remove (&si, true);
8587                       release_defs (stmt);
8588                     }
8589                 }
8590               else
8591                 gsi_next (&si);
8592             }
8593         }
8594
8595       /* Stub out scalar statements that must not survive vectorization.
8596          Doing this here helps with grouped statements, or statements that
8597          are involved in patterns.  */
8598       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8599            !gsi_end_p (gsi); gsi_next (&gsi))
8600         {
8601           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8602           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8603             {
                   /* A MASK_LOAD whose result is not a vector is replaced
                      by an assignment of zero to the same LHS.  */
8604               tree lhs = gimple_get_lhs (call);
8605               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8606                 {
8607                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8608                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8609                   gsi_replace (&gsi, new_stmt, true);
8610                 }
8611             }
8612         }
8613     }                           /* BBs in loop */
8614
8615   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8616      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8617   if (integer_onep (step_vector))
8618     niters_no_overflow = true;
8619   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8620                            niters_vector_mult_vf, !niters_no_overflow);
8621
8622   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8623   scale_profile_for_vect_loop (loop, assumed_vf);
8624
8625   /* True if the final iteration might not handle a full vector's
8626      worth of scalar iterations.  */
8627   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8628   /* The minimum number of iterations performed by the epilogue.  This
8629      is 1 when peeling for gaps because we always need a final scalar
8630      iteration.  */
8631   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8632   /* +1 to convert latch counts to loop iteration counts,
8633      -min_epilogue_iters to remove iterations that cannot be performed
8634        by the vector code.  */
8635   int bias_for_lowest = 1 - min_epilogue_iters;
8636   int bias_for_assumed = bias_for_lowest;
8637   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8638   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8639     {
8640       /* When the amount of peeling is known at compile time, the first
8641          iteration will have exactly alignment_npeels active elements.
8642          In the worst case it will have at least one.  */
8643       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8644       bias_for_lowest += lowest_vf - min_first_active;
8645       bias_for_assumed += assumed_vf - min_first_active;
8646     }
8647   /* In these calculations the "- 1" converts loop iteration counts
8648      back to latch counts.  The two upper bounds are divided by LOWEST_VF,
       the estimate by ASSUMED_VF; the division rounds up when the final
       iteration may be partial and down otherwise.  */
8649   if (loop->any_upper_bound)
8650     loop->nb_iterations_upper_bound
8651       = (final_iter_may_be_partial
8652          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8653                           lowest_vf) - 1
8654          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8655                            lowest_vf) - 1);
8656   if (loop->any_likely_upper_bound)
8657     loop->nb_iterations_likely_upper_bound
8658       = (final_iter_may_be_partial
8659          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8660                           + bias_for_lowest, lowest_vf) - 1
8661          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8662                            + bias_for_lowest, lowest_vf) - 1);
8663   if (loop->any_estimate)
8664     loop->nb_iterations_estimate
8665       = (final_iter_may_be_partial
8666          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8667                           assumed_vf) - 1
8668          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8669                            assumed_vf) - 1);
8670
8671   if (dump_enabled_p ())
8672     {
8673       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8674         {
8675           dump_printf_loc (MSG_NOTE, vect_location,
8676                            "LOOP VECTORIZED\n");
8677           if (loop->inner)
8678             dump_printf_loc (MSG_NOTE, vect_location,
8679                              "OUTER LOOP VECTORIZED\n");
8680           dump_printf (MSG_NOTE, "\n");
8681         }
8682       else
8683         {
8684           dump_printf_loc (MSG_NOTE, vect_location,
8685                            "LOOP EPILOGUE VECTORIZED (VS=");
8686           dump_dec (MSG_NOTE, current_vector_size);
8687           dump_printf (MSG_NOTE, ")\n");
8688         }
8689     }
8690
8691   /* Free SLP instances here because otherwise stmt reference counting
8692      won't work.  */
8693   slp_instance instance;
8694   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8695     vect_free_slp_instance (instance);
8696   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8697   /* Clear-up safelen field since its value is invalid after vectorization
8698      since vectorized loop can have loop-carried dependencies.  */
8699   loop->safelen = 0;
8700
8701   /* Don't vectorize epilogue for epilogue.  */
8702   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8703     epilogue = NULL;
8704
       /* Epilogue vectorization is disabled when this parameter is zero.  */
8705   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8706     epilogue = NULL;
8707
       /* Decide whether any of the target's vector sizes is usable for
          vectorizing the epilogue; drop the epilogue if none is.  */
8708   if (epilogue)
8709     {
8710       auto_vector_sizes vector_sizes;
8711       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8712       unsigned int next_size = 0;
8713
8714       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8715           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8716           && known_eq (vf, lowest_vf))
8717         {
               /* EITERS is the number of iterations left over once the
                  alignment peels and the full vector iterations are taken
                  out.  NOTE(review): if EITERS is 0 the unsigned
                  "eiters - 1" below wraps around; presumably an epilogue
                  only exists here when EITERS is nonzero - confirm.  */
8718           unsigned int eiters
8719             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8720                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8721           eiters = eiters % lowest_vf;
8722           epilogue->nb_iterations_upper_bound = eiters - 1;
8723
               /* Skip sizes until one is found of which the current vector
                  size is a constant multiple (by RATIO) and for which
                  EITERS reaches LOWEST_VF / RATIO.  */
8724           unsigned int ratio;
8725           while (next_size < vector_sizes.length ()
8726                  && !(constant_multiple_p (current_vector_size,
8727                                            vector_sizes[next_size], &ratio)
8728                       && eiters >= lowest_vf / ratio))
8729             next_size += 1;
8730         }
8731       else
             /* Without a known leftover count, skip the sizes that may be
                larger than the current vector size.  */
8732         while (next_size < vector_sizes.length ()
8733                && maybe_lt (current_vector_size, vector_sizes[next_size]))
8734           next_size += 1;
8735
8736       if (next_size == vector_sizes.length ())
8737         epilogue = NULL;
8738     }
8739
8740   if (epilogue)
8741     {
           /* NOTE(review): loop->safelen was set to 0 above, so the safelen
              copied here is 0 - confirm that this is intended.  */
8742       epilogue->force_vectorize = loop->force_vectorize;
8743       epilogue->safelen = loop->safelen;
8744       epilogue->dont_vectorize = false;
8745
8746       /* We may need to if-convert epilogue to vectorize it.  */
8747       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8748         tree_if_conversion (epilogue);
8749     }
8750
8751   return epilogue;
8752 }
8753
8754 /* The code below is trying to perform simple optimization - revert
8755    if-conversion for masked stores, i.e. if the mask of a store is zero
8756    do not perform it and all stored value producers also if possible.
8757    For example,
8758      for (i=0; i<n; i++)
8759        if (c[i])
8760         {
8761           p1[i] += 1;
8762           p2[i] = p3[i] +2;
8763         }
8764    this transformation will produce the following semi-hammock:
8765
8766    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8767      {
8768        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8769        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8770        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8771        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8772        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8773        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8774      }
        
        LOOP is scanned for IFN_MASK_STORE calls.  For each one a new block
        is created that is entered only when the mask is not all-zero; the
        store, the statements producing its stored value that have no other
        uses, and directly preceding masked stores with the same mask are
        moved into that block.
8775 */
8776
8777 void
8778 optimize_mask_stores (struct loop *loop)
8779 {
8780   basic_block *bbs = get_loop_body (loop);
8781   unsigned nbbs = loop->num_nodes;
8782   unsigned i;
8783   basic_block bb;
8784   struct loop *bb_loop;
8785   gimple_stmt_iterator gsi;
8786   gimple *stmt;
8787   auto_vec<gimple *> worklist;
8788
8789   vect_location = find_loop_location (loop);
8790   /* Pick up all masked stores in loop if any.  */
8791   for (i = 0; i < nbbs; i++)
8792     {
8793       bb = bbs[i];
8794       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8795            gsi_next (&gsi))
8796         {
8797           stmt = gsi_stmt (gsi);
8798           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8799             worklist.safe_push (stmt);
8800         }
8801     }
8802
       /* The block array from get_loop_body is not needed any more.  */
8803   free (bbs);
8804   if (worklist.is_empty ())
8805     return;
8806
8807   /* Loop has masked stores.  */
8808   while (!worklist.is_empty ())
8809     {
8810       gimple *last, *last_store;
8811       edge e, efalse;
8812       tree mask;
8813       basic_block store_bb, join_bb;
8814       gimple_stmt_iterator gsi_to;
8815       tree vdef, new_vdef;
8816       gphi *phi;
8817       tree vectype;
8818       tree zero;
8819
           /* Stores are taken from the end of the worklist, i.e. in the
              reverse of the order in which they were collected.  The mask
              is call argument 2 of the masked store.  */
8820       last = worklist.pop ();
8821       mask = gimple_call_arg (last, 2);
8822       bb = gimple_bb (last);
8823       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8824          the same loop as if_bb.  It could be different to LOOP when two
8825          level loop-nest is vectorized and mask_store belongs to the inner
8826          one.  */
8827       e = split_block (bb, last);
8828       bb_loop = bb->loop_father;
8829       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8830       join_bb = e->dest;
8831       store_bb = create_empty_bb (bb);
8832       add_bb_to_loop (store_bb, bb_loop);
           /* The condition built below is MASK == 0.  Its true edge E goes
              straight to JOIN_BB and bypasses the stores; its false edge
              EFALSE enters STORE_BB.  */
8833       e->flags = EDGE_TRUE_VALUE;
8834       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8835       /* Put STORE_BB to likely part.  NOTE(review): the code below marks
          the edge into STORE_BB with profile_probability::unlikely (),
          which does not match the word "likely" - confirm which one is
          intended.  */
8836       efalse->probability = profile_probability::unlikely ();
8837       store_bb->count = efalse->count ();
8838       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8839       if (dom_info_available_p (CDI_DOMINATORS))
8840         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
           /* NOTE(review): unlike the other dump messages in this function,
              this format string has no trailing newline - confirm.  */
8841       if (dump_enabled_p ())
8842         dump_printf_loc (MSG_NOTE, vect_location,
8843                          "Create new block %d to sink mask stores.",
8844                          store_bb->index);
8845       /* Create vector comparison with boolean result.  */
8846       vectype = TREE_TYPE (mask);
8847       zero = build_zero_cst (vectype);
8848       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8849       gsi = gsi_last_bb (bb);
8850       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8851       /* Create new PHI node for vdef of the last masked store:
8852          .MEM_2 = VDEF <.MEM_1>
8853          will be converted to
8854          .MEM.3 = VDEF <.MEM_1>
8855          and new PHI node will be created in join bb
8856          .MEM_2 = PHI <.MEM_1, .MEM_3>
8857       */
8858       vdef = gimple_vdef (last);
8859       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8860       gimple_set_vdef (last, new_vdef);
8861       phi = create_phi_node (vdef, join_bb);
8862       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8863
8864       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8865       while (true)
8866         {
8867           gimple_stmt_iterator gsi_from;
               /* STMT1 ends up as the statement at which the backward scan
                  below stopped, or the last one it examined.  */
8868           gimple *stmt1 = NULL;
8869
8870           /* Move masked store to STORE_BB.  */
8871           last_store = last;
8872           gsi = gsi_for_stmt (last);
8873           gsi_from = gsi;
8874           /* Shift GSI to the previous stmt for further traversal.  */
8875           gsi_prev (&gsi);
8876           gsi_to = gsi_start_bb (store_bb);
8877           gsi_move_before (&gsi_from, &gsi_to);
8878           /* Setup GSI_TO to the non-empty block start.  */
8879           gsi_to = gsi_start_bb (store_bb);
8880           if (dump_enabled_p ())
8881             {
8882               dump_printf_loc (MSG_NOTE, vect_location,
8883                                "Move stmt to created bb\n");
8884               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8885             }
8886           /* Move all stored value producers if possible.  The statements
              before the store are walked backwards; the walk stops at the
              first statement that cannot be moved.  */
8887           while (!gsi_end_p (gsi))
8888             {
8889               tree lhs;
8890               imm_use_iterator imm_iter;
8891               use_operand_p use_p;
8892               bool res;
8893
8894               /* Skip debug statements.  */
8895               if (is_gimple_debug (gsi_stmt (gsi)))
8896                 {
8897                   gsi_prev (&gsi);
8898                   continue;
8899                 }
8900               stmt1 = gsi_stmt (gsi);
8901               /* Do not consider statements writing to memory or having
8902                  volatile operand.  */
8903               if (gimple_vdef (stmt1)
8904                   || gimple_has_volatile_ops (stmt1))
8905                 break;
                   /* Remember STMT1's position in GSI_FROM and step GSI back
                      first, so that moving or removing STMT1 does not
                      disturb the traversal.  */
8906               gsi_from = gsi;
8907               gsi_prev (&gsi);
8908               lhs = gimple_get_lhs (stmt1);
8909               if (!lhs)
8910                 break;
8911
8912               /* LHS of vectorized stmt must be SSA_NAME.  */
8913               if (TREE_CODE (lhs) != SSA_NAME)
8914                 break;
8915
8916               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8917                 {
8918                   /* Remove dead scalar statement.  */
8919                   if (has_zero_uses (lhs))
8920                     {
8921                       gsi_remove (&gsi_from, true);
8922                       continue;
8923                     }
8924                 }
8925
8926               /* Check that LHS does not have uses outside of STORE_BB.  */
8927               res = true;
8928               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8929                 {
8930                   gimple *use_stmt;
8931                   use_stmt = USE_STMT (use_p);
8932                   if (is_gimple_debug (use_stmt))
8933                     continue;
8934                   if (gimple_bb (use_stmt) != store_bb)
8935                     {
8936                       res = false;
8937                       break;
8938                     }
8939                 }
8940               if (!res)
8941                 break;
8942
                   /* A statement that reads memory may only be moved if it
                      has the same virtual use as the store being sunk.  */
8943               if (gimple_vuse (stmt1)
8944                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
8945                 break;
8946
8947               /* Can move STMT1 to STORE_BB.  */
8948               if (dump_enabled_p ())
8949                 {
8950                   dump_printf_loc (MSG_NOTE, vect_location,
8951                                    "Move stmt to created bb\n");
8952                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8953                 }
8954               gsi_move_before (&gsi_from, &gsi_to);
8955               /* Shift GSI_TO for further insertion.  */
8956               gsi_prev (&gsi_to);
8957             }
8958           /* Put other masked stores with the same mask to STORE_BB.  This
              continues only when the next worklist entry uses the same mask
              and is exactly the statement STMT1 recorded by the scan
              above.  */
8959           if (worklist.is_empty ()
8960               || gimple_call_arg (worklist.last (), 2) != mask
8961               || worklist.last () != stmt1)
8962             break;
8963           last = worklist.pop ();
8964         }
           /* On the bypass edge E the PHI takes the virtual use of the last
              store that was moved into STORE_BB.  */
8965       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8966     }
8967 }
8966     }
8967 }