gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations which vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157
 158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 159    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 160    may already be set for general statements (not just data refs).  */
 161
 162 static bool
 163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
 164                               bool vectype_maybe_set_p,
 165                               poly_uint64 *vf,
 166                               vec<stmt_vec_info > *mask_producers)
 167 {
 168   gimple *stmt = stmt_info->stmt;
 169
 170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 171        && !STMT_VINFO_LIVE_P (stmt_info))
 172       || gimple_clobber_p (stmt))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 176       return true;
 177     }
 178
 179   tree stmt_vectype, nunits_vectype;
 180   if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
 181                                        &nunits_vectype))
 182     return false;
 183
 184   if (stmt_vectype)
 185     {
 186       if (STMT_VINFO_VECTYPE (stmt_info))
 187         /* The only case when a vectype had been already set is for stmts
 188            that contain a data ref, or for "pattern-stmts" (stmts generated
 189            by the vectorizer to represent/replace a certain idiom).  */
 190         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 191                      || vectype_maybe_set_p)
 192                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 193       else if (stmt_vectype == boolean_type_node)
 194         mask_producers->safe_push (stmt_info);
 195       else
 196         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 197     }
 198
 199   if (nunits_vectype)
 200     vect_update_max_nunits (vf, nunits_vectype);
 201
 202   return true;
 203 }
 204
 205 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 206    types of STMT_INFO and all attached pattern statements and update
 207    the vectorization factor VF accordingly.  If some of the statements
 208    produce a mask result whose vector type can only be calculated later,
 209    add them to MASK_PRODUCERS.  Return true on success or false if
 210    something prevented vectorization.  */
 211
 212 static bool
 213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
 214                             vec<stmt_vec_info > *mask_producers)
 215 {
 216   if (dump_enabled_p ())
 217     {
 218       dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
 219       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 220     }
 221   if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
 222     return false;
 223
 224   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 225       && STMT_VINFO_RELATED_STMT (stmt_info))
 226     {
 227       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 228       stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
 229
 230       /* If a pattern statement has def stmts, analyze them too.  */
 231       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 232            !gsi_end_p (si); gsi_next (&si))
 233         {
 234           stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
 235           if (dump_enabled_p ())
 236             {
 237               dump_printf_loc (MSG_NOTE, vect_location,
 238                                "==> examining pattern def stmt: ");
 239               dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 240                                 def_stmt_info->stmt, 0);
 241             }
 242           if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 243                                              vf, mask_producers))
 244             return false;
 245         }
 246
 247       if (dump_enabled_p ())
 248         {
 249           dump_printf_loc (MSG_NOTE, vect_location,
 250                            "==> examining pattern statement: ");
 251           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 252         }
 253       if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
 254         return false;
 255     }
 256
 257   return true;
 258 }
 259
 260 /* Function vect_determine_vectorization_factor
 261
 262    Determine the vectorization factor (VF).  VF is the number of data elements
 263    that are operated upon in parallel in a single iteration of the vectorized
 264    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 265    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 266    elements can fit in a single vector register.
 267
 268    We currently support vectorization of loops in which all types operated upon
 269    are of the same size.  Therefore this function currently sets VF according to
 270    the size of the types operated upon, and fails if there are multiple sizes
 271    in the loop.
 272
 273    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 274    original loop:
 275         for (i=0; i<N; i++){
 276           a[i] = b[i] + c[i];
 277         }
 278
 279    vectorized loop:
 280         for (i=0; i<N; i+=VF){
 281           a[i:VF] = b[i:VF] + c[i:VF];
 282         }
 283 */
 284
 285 static bool
 286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 287 {
 288   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 289   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 290   unsigned nbbs = loop->num_nodes;
 291   poly_uint64 vectorization_factor = 1;
 292   tree scalar_type = NULL_TREE;
 293   gphi *phi;
 294   tree vectype;
 295   stmt_vec_info stmt_info;
 296   unsigned i;
 297   auto_vec<stmt_vec_info> mask_producers;
 298
 299   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 300
 301   for (i = 0; i < nbbs; i++)
 302     {
 303       basic_block bb = bbs[i];
 304
 305       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 306            gsi_next (&si))
 307         {
 308           phi = si.phi ();
 309           stmt_info = vinfo_for_stmt (phi);
 310           if (dump_enabled_p ())
 311             {
 312               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 313               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 314             }
 315
 316           gcc_assert (stmt_info);
 317
 318           if (STMT_VINFO_RELEVANT_P (stmt_info)
 319               || STMT_VINFO_LIVE_P (stmt_info))
 320             {
 321               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 322               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 323
 324               if (dump_enabled_p ())
 325                 {
 326                   dump_printf_loc (MSG_NOTE, vect_location,
 327                                    "get vectype for scalar type:  ");
 328                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 329                   dump_printf (MSG_NOTE, "\n");
 330                 }
 331
 332               vectype = get_vectype_for_scalar_type (scalar_type);
 333               if (!vectype)
 334                 {
 335                   if (dump_enabled_p ())
 336                     {
 337                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 338                                        "not vectorized: unsupported "
 339                                        "data-type ");
 340                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 341                                          scalar_type);
 342                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 343                     }
 344                   return false;
 345                 }
 346               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 347
 348               if (dump_enabled_p ())
 349                 {
 350                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 351                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 352                   dump_printf (MSG_NOTE, "\n");
 353                 }
 354
 355               if (dump_enabled_p ())
 356                 {
 357                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 358                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 359                   dump_printf (MSG_NOTE, "\n");
 360                 }
 361
 362               vect_update_max_nunits (&vectorization_factor, vectype);
 363             }
 364         }
 365
 366       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 367            gsi_next (&si))
 368         {
 369           stmt_info = vinfo_for_stmt (gsi_stmt (si));
 370           if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
 371                                            &mask_producers))
 372             return false;
 373         }
 374     }
 375
 376   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 377   if (dump_enabled_p ())
 378     {
 379       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 380       dump_dec (MSG_NOTE, vectorization_factor);
 381       dump_printf (MSG_NOTE, "\n");
 382     }
 383
 384   if (known_le (vectorization_factor, 1U))
 385     {
 386       if (dump_enabled_p ())
 387         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 388                          "not vectorized: unsupported data-type\n");
 389       return false;
 390     }
 391   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 392
 393   for (i = 0; i < mask_producers.length (); i++)
 394     {
 395       stmt_info = mask_producers[i];
 396       tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
 397       if (!mask_type)
 398         return false;
 399       STMT_VINFO_VECTYPE (stmt_info) = mask_type;
 400     }
 401
 402   return true;
 403 }
 404
 405
 406 /* Function vect_is_simple_iv_evolution.
 407
 408    FORNOW: A simple evolution of an induction variables in the loop is
 409    considered a polynomial evolution.  */
 410
 411 static bool
 412 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 413                              tree * step)
 414 {
 415   tree init_expr;
 416   tree step_expr;
 417   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 418   basic_block bb;
 419
 420   /* When there is no evolution in this loop, the evolution function
 421      is not "simple".  */
 422   if (evolution_part == NULL_TREE)
 423     return false;
 424
 425   /* When the evolution is a polynomial of degree >= 2
 426      the evolution function is not "simple".  */
 427   if (tree_is_chrec (evolution_part))
 428     return false;
 429
 430   step_expr = evolution_part;
 431   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 432
 433   if (dump_enabled_p ())
 434     {
 435       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 436       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 437       dump_printf (MSG_NOTE, ",  init: ");
 438       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 439       dump_printf (MSG_NOTE, "\n");
 440     }
 441
 442   *init = init_expr;
 443   *step = step_expr;
 444
 445   if (TREE_CODE (step_expr) != INTEGER_CST
 446       && (TREE_CODE (step_expr) != SSA_NAME
 447           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 448               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 449           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 450               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 451                   || !flag_associative_math)))
 452       && (TREE_CODE (step_expr) != REAL_CST
 453           || !flag_associative_math))
 454     {
 455       if (dump_enabled_p ())
 456         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 457                          "step unknown.\n");
 458       return false;
 459     }
 460
 461   return true;
 462 }
 463
 464 /* Function vect_analyze_scalar_cycles_1.
 465
 466    Examine the cross iteration def-use cycles of scalar variables
 467    in LOOP.  LOOP_VINFO represents the loop that is now being
 468    considered for vectorization (can be LOOP, or an outer-loop
 469    enclosing LOOP).  */
 470
 471 static void
 472 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 473 {
 474   basic_block bb = loop->header;
 475   tree init, step;
 476   auto_vec<gimple *, 64> worklist;
 477   gphi_iterator gsi;
 478   bool double_reduc;
 479
 480   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 481
 482   /* First - identify all inductions.  Reduction detection assumes that all the
 483      inductions have been identified, therefore, this order must not be
 484      changed.  */
 485   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 486     {
 487       gphi *phi = gsi.phi ();
 488       tree access_fn = NULL;
 489       tree def = PHI_RESULT (phi);
 490       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 491
 492       if (dump_enabled_p ())
 493         {
 494           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 495           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 496         }
 497
 498       /* Skip virtual phi's.  The data dependences that are associated with
 499          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 500       if (virtual_operand_p (def))
 501         continue;
 502
 503       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 504
 505       /* Analyze the evolution function.  */
 506       access_fn = analyze_scalar_evolution (loop, def);
 507       if (access_fn)
 508         {
 509           STRIP_NOPS (access_fn);
 510           if (dump_enabled_p ())
 511             {
 512               dump_printf_loc (MSG_NOTE, vect_location,
 513                                "Access function of PHI: ");
 514               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 515               dump_printf (MSG_NOTE, "\n");
 516             }
 517           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 518             = initial_condition_in_loop_num (access_fn, loop->num);
 519           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 520             = evolution_part_in_loop_num (access_fn, loop->num);
 521         }
 522
 523       if (!access_fn
 524           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 525           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 526               && TREE_CODE (step) != INTEGER_CST))
 527         {
 528           worklist.safe_push (phi);
 529           continue;
 530         }
 531
 532       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 533                   != NULL_TREE);
 534       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 535
 536       if (dump_enabled_p ())
 537         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 538       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 539     }
 540
 541
 542   /* Second - identify all reductions and nested cycles.  */
 543   while (worklist.length () > 0)
 544     {
 545       gimple *phi = worklist.pop ();
 546       tree def = PHI_RESULT (phi);
 547       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 548       gimple *reduc_stmt;
 549
 550       if (dump_enabled_p ())
 551         {
 552           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 553           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 554         }
 555
 556       gcc_assert (!virtual_operand_p (def)
 557                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 558
 559       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 560                                                 &double_reduc, false);
 561       if (reduc_stmt)
 562         {
 563           if (double_reduc)
 564             {
 565               if (dump_enabled_p ())
 566                 dump_printf_loc (MSG_NOTE, vect_location,
 567                                  "Detected double reduction.\n");
 568
 569               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 570               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 571                                                     vect_double_reduction_def;
 572             }
 573           else
 574             {
 575               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 576                 {
 577                   if (dump_enabled_p ())
 578                     dump_printf_loc (MSG_NOTE, vect_location,
 579                                      "Detected vectorizable nested cycle.\n");
 580
 581                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 582                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 583                                                              vect_nested_cycle;
 584                 }
 585               else
 586                 {
 587                   if (dump_enabled_p ())
 588                     dump_printf_loc (MSG_NOTE, vect_location,
 589                                      "Detected reduction.\n");
 590
 591                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 592                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 593                                                            vect_reduction_def;
 594                   /* Store the reduction cycles for possible vectorization in
 595                      loop-aware SLP if it was not detected as reduction
 596                      chain.  */
 597                   if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 598                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 599                 }
 600             }
 601         }
 602       else
 603         if (dump_enabled_p ())
 604           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 605                            "Unknown def-use cycle pattern.\n");
 606     }
 607 }
 608
 609
 610 /* Function vect_analyze_scalar_cycles.
 611
 612    Examine the cross iteration def-use cycles of scalar variables, by
 613    analyzing the loop-header PHIs of scalar variables.  Classify each
 614    cycle as one of the following: invariant, induction, reduction, unknown.
 615    We do that for the loop represented by LOOP_VINFO, and also to its
 616    inner-loop, if exists.
 617    Examples for scalar cycles:
 618
 619    Example1: reduction:
 620
 621               loop1:
 622               for (i=0; i<N; i++)
 623                  sum += a[i];
 624
 625    Example2: induction:
 626
 627               loop2:
 628               for (i=0; i<N; i++)
 629                  a[i] = i;  */
 630
 631 static void
 632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 633 {
 634   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 635
 636   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 637
 638   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 639      Reductions in such inner-loop therefore have different properties than
 640      the reductions in the nest that gets vectorized:
 641      1. When vectorized, they are executed in the same order as in the original
 642         scalar loop, so we can't change the order of computation when
 643         vectorizing them.
 644      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 645         current checks are too strict.  */
 646
 647   if (loop->inner)
 648     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 649 }
 650
 651 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 652
 653 static void
 654 vect_fixup_reduc_chain (gimple *stmt)
 655 {
 656   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 657   gimple *stmtp;
 658   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 659               && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 660   REDUC_GROUP_SIZE (vinfo_for_stmt (firstp))
 661     = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
 662   do
 663     {
 664       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 665       REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 666       stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 667       if (stmt)
 668         REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 669           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 670     }
 671   while (stmt);
 672   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 673 }
 674
 675 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 676
 677 static void
 678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 679 {
 680   gimple *first;
 681   unsigned i;
 682
 683   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 684     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 685       {
 686         gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 687         while (next)
 688           {
 689             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 690               break;
 691             next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 692           }
 693         /* If not all stmt in the chain are patterns try to handle
 694            the chain without patterns.  */
 695         if (! next)
 696           {
 697             vect_fixup_reduc_chain (first);
 698             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 699               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 700           }
 701       }
 702 }
 703
 704 /* Function vect_get_loop_niters.
 705
 706    Determine how many iterations the loop is executed and place it
 707    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 708    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 709    niter information holds in ASSUMPTIONS.
 710
 711    Return the loop exit condition.  */
 712
 713
 714 static gcond *
 715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
 716                       tree *number_of_iterations, tree *number_of_iterationsm1)
 717 {
 718   edge exit = single_exit (loop);
 719   struct tree_niter_desc niter_desc;
 720   tree niter_assumptions, niter, may_be_zero;
 721   gcond *cond = get_loop_exit_condition (loop);
 722
 723   *assumptions = boolean_true_node;
 724   *number_of_iterationsm1 = chrec_dont_know;
 725   *number_of_iterations = chrec_dont_know;
 726   DUMP_VECT_SCOPE ("get_loop_niters");
 727
 728   if (!exit)
 729     return cond;
 730
 731   niter = chrec_dont_know;
 732   may_be_zero = NULL_TREE;
 733   niter_assumptions = boolean_true_node;
 734   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 735       || chrec_contains_undetermined (niter_desc.niter))
 736     return cond;
 737
 738   niter_assumptions = niter_desc.assumptions;
 739   may_be_zero = niter_desc.may_be_zero;
 740   niter = niter_desc.niter;
 741
 742   if (may_be_zero && integer_zerop (may_be_zero))
 743     may_be_zero = NULL_TREE;
 744
 745   if (may_be_zero)
 746     {
 747       if (COMPARISON_CLASS_P (may_be_zero))
 748         {
 749           /* Try to combine may_be_zero with assumptions, this can simplify
 750              computation of niter expression.  */
 751           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 752             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 753                                              niter_assumptions,
 754                                              fold_build1 (TRUTH_NOT_EXPR,
 755                                                           boolean_type_node,
 756                                                           may_be_zero));
 757           else
 758             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 759                                  build_int_cst (TREE_TYPE (niter), 0),
 760                                  rewrite_to_non_trapping_overflow (niter));
 761
 762           may_be_zero = NULL_TREE;
 763         }
 764       else if (integer_nonzerop (may_be_zero))
 765         {
 766           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 767           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 768           return cond;
 769         }
 770       else
 771         return cond;
 772     }
 773
 774   *assumptions = niter_assumptions;
 775   *number_of_iterationsm1 = niter;
 776
 777   /* We want the number of loop header executions which is the number
 778      of latch executions plus one.
 779      ???  For UINT_MAX latch executions this number overflows to zero
 780      for loops like do { n++; } while (n != 0);  */
 781   if (niter && !chrec_contains_undetermined (niter))
 782     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 783                           build_int_cst (TREE_TYPE (niter), 1));
 784   *number_of_iterations = niter;
 785
 786   return cond;
 787 }
 788
 789 /* Function bb_in_loop_p
 790
 791    Used as predicate for dfs order traversal of the loop bbs.  */
 792
 793 static bool
 794 bb_in_loop_p (const_basic_block bb, const void *data)
 795 {
 796   const struct loop *const loop = (const struct loop *)data;
 797   if (flow_bb_inside_loop_p (loop, bb))
 798     return true;
 799   return false;
 800 }
 801
 802
 803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 804    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 805
 806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
 807   : vec_info (vec_info::loop, init_cost (loop_in), shared),
 808     loop (loop_in),
 809     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 810     num_itersm1 (NULL_TREE),
 811     num_iters (NULL_TREE),
 812     num_iters_unchanged (NULL_TREE),
 813     num_iters_assumptions (NULL_TREE),
 814     th (0),
 815     versioning_threshold (0),
 816     vectorization_factor (0),
 817     max_vectorization_factor (0),
 818     mask_skip_niters (NULL_TREE),
 819     mask_compare_type (NULL_TREE),
 820     unaligned_dr (NULL),
 821     peeling_for_alignment (0),
 822     ptr_mask (0),
 823     ivexpr_map (NULL),
 824     slp_unrolling_factor (1),
 825     single_scalar_iteration_cost (0),
 826     vectorizable (false),
 827     can_fully_mask_p (true),
 828     fully_masked_p (false),
 829     peeling_for_gaps (false),
 830     peeling_for_niter (false),
 831     operands_swapped (false),
 832     no_data_dependencies (false),
 833     has_mask_store (false),
 834     scalar_loop (NULL),
 835     orig_loop_info (NULL)
 836 {
 837   /* Create/Update stmt_info for all stmts in the loop.  */
 838   basic_block *body = get_loop_body (loop);
 839   for (unsigned int i = 0; i < loop->num_nodes; i++)
 840     {
 841       basic_block bb = body[i];
 842       gimple_stmt_iterator si;
 843
 844       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 845         {
 846           gimple *phi = gsi_stmt (si);
 847           gimple_set_uid (phi, 0);
 848           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
 849         }
 850
 851       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 852         {
 853           gimple *stmt = gsi_stmt (si);
 854           gimple_set_uid (stmt, 0);
 855           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
 856         }
 857     }
 858   free (body);
 859
 860   /* CHECKME: We want to visit all BBs before their successors (except for
 861      latch blocks, for which this assertion wouldn't hold).  In the simple
 862      case of the loop forms we allow, a dfs order of the BBs would the same
 863      as reversed postorder traversal, so we are safe.  */
 864
 865   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 866                                           bbs, loop->num_nodes, loop);
 867   gcc_assert (nbbs == loop->num_nodes);
 868 }
 869
 870 /* Free all levels of MASKS.  */
 871
 872 void
 873 release_vec_loop_masks (vec_loop_masks *masks)
 874 {
 875   rgroup_masks *rgm;
 876   unsigned int i;
 877   FOR_EACH_VEC_ELT (*masks, i, rgm)
 878     rgm->masks.release ();
 879   masks->release ();
 880 }
 881
 882 /* Free all memory used by the _loop_vec_info, as well as all the
 883    stmt_vec_info structs of all the stmts in the loop.  */
 884
 885 _loop_vec_info::~_loop_vec_info ()
 886 {
 887   int nbbs;
 888   gimple_stmt_iterator si;
 889   int j;
 890
 891   /* ???  We're releasing loop_vinfos en-block.  */
 892   set_stmt_vec_info_vec (&stmt_vec_infos);
 893   nbbs = loop->num_nodes;
 894   for (j = 0; j < nbbs; j++)
 895     {
 896       basic_block bb = bbs[j];
 897       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 898         free_stmt_vec_info (gsi_stmt (si));
 899
 900       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 901         {
 902           gimple *stmt = gsi_stmt (si);
 903
 904           /* We may have broken canonical form by moving a constant
 905              into RHS1 of a commutative op.  Fix such occurrences.  */
 906           if (operands_swapped && is_gimple_assign (stmt))
 907             {
 908               enum tree_code code = gimple_assign_rhs_code (stmt);
 909
 910               if ((code == PLUS_EXPR
 911                    || code == POINTER_PLUS_EXPR
 912                    || code == MULT_EXPR)
 913                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 914                 swap_ssa_operands (stmt,
 915                                    gimple_assign_rhs1_ptr (stmt),
 916                                    gimple_assign_rhs2_ptr (stmt));
 917               else if (code == COND_EXPR
 918                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
 919                 {
 920                   tree cond_expr = gimple_assign_rhs1 (stmt);
 921                   enum tree_code cond_code = TREE_CODE (cond_expr);
 922
 923                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
 924                     {
 925                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
 926                                                                   0));
 927                       cond_code = invert_tree_comparison (cond_code,
 928                                                           honor_nans);
 929                       if (cond_code != ERROR_MARK)
 930                         {
 931                           TREE_SET_CODE (cond_expr, cond_code);
 932                           swap_ssa_operands (stmt,
 933                                              gimple_assign_rhs2_ptr (stmt),
 934                                              gimple_assign_rhs3_ptr (stmt));
 935                         }
 936                     }
 937                 }
 938             }
 939
 940           /* Free stmt_vec_info.  */
 941           free_stmt_vec_info (stmt);
 942           gsi_next (&si);
 943         }
 944     }
 945
 946   free (bbs);
 947
 948   release_vec_loop_masks (&masks);
 949   delete ivexpr_map;
 950
 951   loop->aux = NULL;
 952 }
 953
 954 /* Return an invariant or register for EXPR and emit necessary
 955    computations in the LOOP_VINFO loop preheader.  */
 956
 957 tree
 958 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 959 {
 960   if (is_gimple_reg (expr)
 961       || is_gimple_min_invariant (expr))
 962     return expr;
 963
 964   if (! loop_vinfo->ivexpr_map)
 965     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 966   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 967   if (! cached)
 968     {
 969       gimple_seq stmts = NULL;
 970       cached = force_gimple_operand (unshare_expr (expr),
 971                                      &stmts, true, NULL_TREE);
 972       if (stmts)
 973         {
 974           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 975           gsi_insert_seq_on_edge_immediate (e, stmts);
 976         }
 977     }
 978   return cached;
 979 }
 980
 981 /* Return true if we can use CMP_TYPE as the comparison type to produce
 982    all masks required to mask LOOP_VINFO.  */
 983
 984 static bool
 985 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 986 {
 987   rgroup_masks *rgm;
 988   unsigned int i;
 989   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 990     if (rgm->mask_type != NULL_TREE
 991         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 992                                             cmp_type, rgm->mask_type,
 993                                             OPTIMIZE_FOR_SPEED))
 994       return false;
 995   return true;
 996 }
 997
 998 /* Calculate the maximum number of scalars per iteration for every
 999    rgroup in LOOP_VINFO.  */
1000
1001 static unsigned int
1002 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1003 {
1004   unsigned int res = 1;
1005   unsigned int i;
1006   rgroup_masks *rgm;
1007   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1008     res = MAX (res, rgm->max_nscalars_per_iter);
1009   return res;
1010 }
1011
1012 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1013    whether we can actually generate the masks required.  Return true if so,
1014    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1015
1016 static bool
1017 vect_verify_full_masking (loop_vec_info loop_vinfo)
1018 {
1019   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1020   unsigned int min_ni_width;
1021
1022   /* Use a normal loop if there are no statements that need masking.
1023      This only happens in rare degenerate cases: it means that the loop
1024      has no loads, no stores, and no live-out values.  */
1025   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1026     return false;
1027
1028   /* Get the maximum number of iterations that is representable
1029      in the counter type.  */
1030   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1031   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1032
1033   /* Get a more refined estimate for the number of iterations.  */
1034   widest_int max_back_edges;
1035   if (max_loop_iterations (loop, &max_back_edges))
1036     max_ni = wi::smin (max_ni, max_back_edges + 1);
1037
1038   /* Account for rgroup masks, in which each bit is replicated N times.  */
1039   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1040
1041   /* Work out how many bits we need to represent the limit.  */
1042   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1043
1044   /* Find a scalar mode for which WHILE_ULT is supported.  */
1045   opt_scalar_int_mode cmp_mode_iter;
1046   tree cmp_type = NULL_TREE;
1047   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1048     {
1049       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1050       if (cmp_bits >= min_ni_width
1051           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1052         {
1053           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1054           if (this_type
1055               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1056             {
1057               /* Although we could stop as soon as we find a valid mode,
1058                  it's often better to continue until we hit Pmode, since the
1059                  operands to the WHILE are more likely to be reusable in
1060                  address calculations.  */
1061               cmp_type = this_type;
1062               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1063                 break;
1064             }
1065         }
1066     }
1067
1068   if (!cmp_type)
1069     return false;
1070
1071   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1072   return true;
1073 }
1074
1075 /* Calculate the cost of one scalar iteration of the loop.  */
1076 static void
1077 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1078 {
1079   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1080   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1081   int nbbs = loop->num_nodes, factor;
1082   int innerloop_iters, i;
1083
1084   /* Gather costs for statements in the scalar loop.  */
1085
1086   /* FORNOW.  */
1087   innerloop_iters = 1;
1088   if (loop->inner)
1089     innerloop_iters = 50; /* FIXME */
1090
1091   for (i = 0; i < nbbs; i++)
1092     {
1093       gimple_stmt_iterator si;
1094       basic_block bb = bbs[i];
1095
1096       if (bb->loop_father == loop->inner)
1097         factor = innerloop_iters;
1098       else
1099         factor = 1;
1100
1101       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1102         {
1103           gimple *stmt = gsi_stmt (si);
1104           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1105
1106           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1107             continue;
1108
1109           /* Skip stmts that are not vectorized inside the loop.  */
1110           if (stmt_info
1111               && !STMT_VINFO_RELEVANT_P (stmt_info)
1112               && (!STMT_VINFO_LIVE_P (stmt_info)
1113                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1114               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1115             continue;
1116
1117           vect_cost_for_stmt kind;
1118           if (STMT_VINFO_DATA_REF (stmt_info))
1119             {
1120               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1121                kind = scalar_load;
1122              else
1123                kind = scalar_store;
1124             }
1125           else
1126             kind = scalar_stmt;
1127
1128           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1129                             factor, kind, stmt_info, 0, vect_prologue);
1130         }
1131     }
1132
1133   /* Now accumulate cost.  */
1134   void *target_cost_data = init_cost (loop);
1135   stmt_info_for_cost *si;
1136   int j;
1137   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1138                     j, si)
1139     {
1140       struct _stmt_vec_info *stmt_info
1141         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1142       (void) add_stmt_cost (target_cost_data, si->count,
1143                             si->kind, stmt_info, si->misalign,
1144                             vect_body);
1145     }
1146   unsigned dummy, body_cost = 0;
1147   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1148   destroy_cost_data (target_cost_data);
1149   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1150 }
1151
1152
1153 /* Function vect_analyze_loop_form_1.
1154
1155    Verify that certain CFG restrictions hold, including:
1156    - the loop has a pre-header
1157    - the loop has a single entry and exit
1158    - the loop exit condition is simple enough
1159    - the number of iterations can be analyzed, i.e, a countable loop.  The
1160      niter could be analyzed under some assumptions.  */
1161
1162 bool
1163 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1164                           tree *assumptions, tree *number_of_iterationsm1,
1165                           tree *number_of_iterations, gcond **inner_loop_cond)
1166 {
1167   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1168
1169   /* Different restrictions apply when we are considering an inner-most loop,
1170      vs. an outer (nested) loop.
1171      (FORNOW. May want to relax some of these restrictions in the future).  */
1172
1173   if (!loop->inner)
1174     {
1175       /* Inner-most loop.  We currently require that the number of BBs is
1176          exactly 2 (the header and latch).  Vectorizable inner-most loops
1177          look like this:
1178
1179                         (pre-header)
1180                            |
1181                           header <--------+
1182                            | |            |
1183                            | +--> latch --+
1184                            |
1185                         (exit-bb)  */
1186
1187       if (loop->num_nodes != 2)
1188         {
1189           if (dump_enabled_p ())
1190             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191                              "not vectorized: control flow in loop.\n");
1192           return false;
1193         }
1194
1195       if (empty_block_p (loop->header))
1196         {
1197           if (dump_enabled_p ())
1198             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199                              "not vectorized: empty loop.\n");
1200           return false;
1201         }
1202     }
1203   else
1204     {
1205       struct loop *innerloop = loop->inner;
1206       edge entryedge;
1207
1208       /* Nested loop. We currently require that the loop is doubly-nested,
1209          contains a single inner loop, and the number of BBs is exactly 5.
1210          Vectorizable outer-loops look like this:
1211
1212                         (pre-header)
1213                            |
1214                           header <---+
1215                            |         |
1216                           inner-loop |
1217                            |         |
1218                           tail ------+
1219                            |
1220                         (exit-bb)
1221
1222          The inner-loop has the properties expected of inner-most loops
1223          as described above.  */
1224
1225       if ((loop->inner)->inner || (loop->inner)->next)
1226         {
1227           if (dump_enabled_p ())
1228             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229                              "not vectorized: multiple nested loops.\n");
1230           return false;
1231         }
1232
1233       if (loop->num_nodes != 5)
1234         {
1235           if (dump_enabled_p ())
1236             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237                              "not vectorized: control flow in loop.\n");
1238           return false;
1239         }
1240
1241       entryedge = loop_preheader_edge (innerloop);
1242       if (entryedge->src != loop->header
1243           || !single_exit (innerloop)
1244           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1245         {
1246           if (dump_enabled_p ())
1247             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248                              "not vectorized: unsupported outerloop form.\n");
1249           return false;
1250         }
1251
1252       /* Analyze the inner-loop.  */
1253       tree inner_niterm1, inner_niter, inner_assumptions;
1254       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1255                                       &inner_assumptions, &inner_niterm1,
1256                                       &inner_niter, NULL)
1257           /* Don't support analyzing niter under assumptions for inner
1258              loop.  */
1259           || !integer_onep (inner_assumptions))
1260         {
1261           if (dump_enabled_p ())
1262             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1263                              "not vectorized: Bad inner loop.\n");
1264           return false;
1265         }
1266
1267       if (!expr_invariant_in_loop_p (loop, inner_niter))
1268         {
1269           if (dump_enabled_p ())
1270             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271                              "not vectorized: inner-loop count not"
1272                              " invariant.\n");
1273           return false;
1274         }
1275
1276       if (dump_enabled_p ())
1277         dump_printf_loc (MSG_NOTE, vect_location,
1278                          "Considering outer-loop vectorization.\n");
1279     }
1280
1281   if (!single_exit (loop)
1282       || EDGE_COUNT (loop->header->preds) != 2)
1283     {
1284       if (dump_enabled_p ())
1285         {
1286           if (!single_exit (loop))
1287             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1288                              "not vectorized: multiple exits.\n");
1289           else if (EDGE_COUNT (loop->header->preds) != 2)
1290             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291                              "not vectorized: too many incoming edges.\n");
1292         }
1293       return false;
1294     }
1295
1296   /* We assume that the loop exit condition is at the end of the loop. i.e,
1297      that the loop is represented as a do-while (with a proper if-guard
1298      before the loop if needed), where the loop header contains all the
1299      executable statements, and the latch is empty.  */
1300   if (!empty_block_p (loop->latch)
1301       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1302     {
1303       if (dump_enabled_p ())
1304         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1305                          "not vectorized: latch block not empty.\n");
1306       return false;
1307     }
1308
1309   /* Make sure the exit is not abnormal.  */
1310   edge e = single_exit (loop);
1311   if (e->flags & EDGE_ABNORMAL)
1312     {
1313       if (dump_enabled_p ())
1314         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1315                          "not vectorized: abnormal loop exit edge.\n");
1316       return false;
1317     }
1318
1319   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1320                                      number_of_iterationsm1);
1321   if (!*loop_cond)
1322     {
1323       if (dump_enabled_p ())
1324         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325                          "not vectorized: complicated exit condition.\n");
1326       return false;
1327     }
1328
1329   if (integer_zerop (*assumptions)
1330       || !*number_of_iterations
1331       || chrec_contains_undetermined (*number_of_iterations))
1332     {
1333       if (dump_enabled_p ())
1334         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1335                          "not vectorized: number of iterations cannot be "
1336                          "computed.\n");
1337       return false;
1338     }
1339
1340   if (integer_zerop (*number_of_iterations))
1341     {
1342       if (dump_enabled_p ())
1343         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1344                          "not vectorized: number of iterations = 0.\n");
1345       return false;
1346     }
1347
1348   return true;
1349 }
1350
1351 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1352
1353 loop_vec_info
1354 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1355 {
1356   tree assumptions, number_of_iterations, number_of_iterationsm1;
1357   gcond *loop_cond, *inner_loop_cond = NULL;
1358
1359   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1360                                   &assumptions, &number_of_iterationsm1,
1361                                   &number_of_iterations, &inner_loop_cond))
1362     return NULL;
1363
1364   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1365   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1366   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1367   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1368   if (!integer_onep (assumptions))
1369     {
1370       /* We consider to vectorize this loop by versioning it under
1371          some assumptions.  In order to do this, we need to clear
1372          existing information computed by scev and niter analyzer.  */
1373       scev_reset_htab ();
1374       free_numbers_of_iterations_estimates (loop);
1375       /* Also set flag for this loop so that following scev and niter
1376          analysis are done under the assumptions.  */
1377       loop_constraint_set (loop, LOOP_C_FINITE);
1378       /* Also record the assumptions for versioning.  */
1379       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1380     }
1381
1382   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1383     {
1384       if (dump_enabled_p ())
1385         {
1386           dump_printf_loc (MSG_NOTE, vect_location,
1387                            "Symbolic number of iterations is ");
1388           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1389           dump_printf (MSG_NOTE, "\n");
1390         }
1391     }
1392
1393   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1394   if (inner_loop_cond)
1395     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1396       = loop_exit_ctrl_vec_info_type;
1397
1398   gcc_assert (!loop->aux);
1399   loop->aux = loop_vinfo;
1400   return loop_vinfo;
1401 }
1402
1403
1404
1405 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1406    statements update the vectorization factor.  */
1407
1408 static void
1409 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1410 {
1411   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1412   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1413   int nbbs = loop->num_nodes;
1414   poly_uint64 vectorization_factor;
1415   int i;
1416
1417   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1418
1419   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1420   gcc_assert (known_ne (vectorization_factor, 0U));
1421
1422   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1423      vectorization factor of the loop is the unrolling factor required by
1424      the SLP instances.  If that unrolling factor is 1, we say, that we
1425      perform pure SLP on loop - cross iteration parallelism is not
1426      exploited.  */
1427   bool only_slp_in_loop = true;
1428   for (i = 0; i < nbbs; i++)
1429     {
1430       basic_block bb = bbs[i];
1431       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1432            gsi_next (&si))
1433         {
1434           gimple *stmt = gsi_stmt (si);
1435           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1436           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1437               && STMT_VINFO_RELATED_STMT (stmt_info))
1438             {
1439               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1440               stmt_info = vinfo_for_stmt (stmt);
1441             }
1442           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1443                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1444               && !PURE_SLP_STMT (stmt_info))
1445             /* STMT needs both SLP and loop-based vectorization.  */
1446             only_slp_in_loop = false;
1447         }
1448     }
1449
1450   if (only_slp_in_loop)
1451     {
1452       dump_printf_loc (MSG_NOTE, vect_location,
1453                        "Loop contains only SLP stmts\n");
1454       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1455     }
1456   else
1457     {
1458       dump_printf_loc (MSG_NOTE, vect_location,
1459                        "Loop contains SLP and non-SLP stmts\n");
1460       /* Both the vectorization factor and unroll factor have the form
1461          current_vector_size * X for some rational X, so they must have
1462          a common multiple.  */
1463       vectorization_factor
1464         = force_common_multiple (vectorization_factor,
1465                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1466     }
1467
1468   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1469   if (dump_enabled_p ())
1470     {
1471       dump_printf_loc (MSG_NOTE, vect_location,
1472                        "Updating vectorization factor to ");
1473       dump_dec (MSG_NOTE, vectorization_factor);
1474       dump_printf (MSG_NOTE, ".\n");
1475     }
1476 }
1477
1478 /* Return true if STMT_INFO describes a double reduction phi and if
1479    the other phi in the reduction is also relevant for vectorization.
1480    This rejects cases such as:
1481
1482       outer1:
1483         x_1 = PHI <x_3(outer2), ...>;
1484         ...
1485
1486       inner:
1487         x_2 = ...;
1488         ...
1489
1490       outer2:
1491         x_3 = PHI <x_2(inner)>;
1492
1493    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1494
1495 static bool
1496 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1497 {
1498   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1499     return false;
1500
1501   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1502   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1503 }
1504
1505 /* Function vect_analyze_loop_operations.
1506
1507    Scan the loop stmts and make sure they are all vectorizable.  */
1508
1509 static bool
1510 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1511 {
1512   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1513   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1514   int nbbs = loop->num_nodes;
1515   int i;
1516   stmt_vec_info stmt_info;
1517   bool need_to_vectorize = false;
1518   bool ok;
1519
1520   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1521
1522   stmt_vector_for_cost cost_vec;
1523   cost_vec.create (2);
1524
1525   for (i = 0; i < nbbs; i++)
1526     {
1527       basic_block bb = bbs[i];
1528
1529       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1530            gsi_next (&si))
1531         {
1532           gphi *phi = si.phi ();
1533           ok = true;
1534
1535           stmt_info = vinfo_for_stmt (phi);
1536           if (dump_enabled_p ())
1537             {
1538               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1539               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1540             }
1541           if (virtual_operand_p (gimple_phi_result (phi)))
1542             continue;
1543
1544           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1545              (i.e., a phi in the tail of the outer-loop).  */
1546           if (! is_loop_header_bb_p (bb))
1547             {
1548               /* FORNOW: we currently don't support the case that these phis
1549                  are not used in the outerloop (unless it is double reduction,
1550                  i.e., this phi is vect_reduction_def), cause this case
1551                  requires to actually do something here.  */
1552               if (STMT_VINFO_LIVE_P (stmt_info)
1553                   && !vect_active_double_reduction_p (stmt_info))
1554                 {
1555                   if (dump_enabled_p ())
1556                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1557                                      "Unsupported loop-closed phi in "
1558                                      "outer-loop.\n");
1559                   return false;
1560                 }
1561
1562               /* If PHI is used in the outer loop, we check that its operand
1563                  is defined in the inner loop.  */
1564               if (STMT_VINFO_RELEVANT_P (stmt_info))
1565                 {
1566                   tree phi_op;
1567                   gimple *op_def_stmt;
1568
1569                   if (gimple_phi_num_args (phi) != 1)
1570                     return false;
1571
1572                   phi_op = PHI_ARG_DEF (phi, 0);
1573                   if (TREE_CODE (phi_op) != SSA_NAME)
1574                     return false;
1575
1576                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1577                   if (gimple_nop_p (op_def_stmt)
1578                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1579                       || !vinfo_for_stmt (op_def_stmt))
1580                     return false;
1581
1582                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1583                         != vect_used_in_outer
1584                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1585                            != vect_used_in_outer_by_reduction)
1586                     return false;
1587                 }
1588
1589               continue;
1590             }
1591
1592           gcc_assert (stmt_info);
1593
1594           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1595                || STMT_VINFO_LIVE_P (stmt_info))
1596               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1597             {
1598               /* A scalar-dependence cycle that we don't support.  */
1599               if (dump_enabled_p ())
1600                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601                                  "not vectorized: scalar dependence cycle.\n");
1602               return false;
1603             }
1604
1605           if (STMT_VINFO_RELEVANT_P (stmt_info))
1606             {
1607               need_to_vectorize = true;
1608               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1609                   && ! PURE_SLP_STMT (stmt_info))
1610                 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1611               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1612                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1613                        && ! PURE_SLP_STMT (stmt_info))
1614                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1615                                              &cost_vec);
1616             }
1617
1618           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1619           if (ok
1620               && STMT_VINFO_LIVE_P (stmt_info)
1621               && !PURE_SLP_STMT (stmt_info))
1622             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1623                                               &cost_vec);
1624
1625           if (!ok)
1626             {
1627               if (dump_enabled_p ())
1628                 {
1629                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630                                    "not vectorized: relevant phi not "
1631                                    "supported: ");
1632                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1633                 }
1634               return false;
1635             }
1636         }
1637
1638       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1639            gsi_next (&si))
1640         {
1641           gimple *stmt = gsi_stmt (si);
1642           if (!gimple_clobber_p (stmt)
1643               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1644                                      &cost_vec))
1645             return false;
1646         }
1647     } /* bbs */
1648
1649   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1650   cost_vec.release ();
1651
1652   /* All operations in the loop are either irrelevant (deal with loop
1653      control, or dead), or only used outside the loop and can be moved
1654      out of the loop (e.g. invariants, inductions).  The loop can be
1655      optimized away by scalar optimizations.  We're better off not
1656      touching this loop.  */
1657   if (!need_to_vectorize)
1658     {
1659       if (dump_enabled_p ())
1660         dump_printf_loc (MSG_NOTE, vect_location,
1661                          "All the computation can be taken out of the loop.\n");
1662       if (dump_enabled_p ())
1663         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1664                          "not vectorized: redundant loop. no profit to "
1665                          "vectorize.\n");
1666       return false;
1667     }
1668
1669   return true;
1670 }
1671
1672 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1673    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1674    definitely no, or -1 if it's worth retrying.  */
1675
1676 static int
1677 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1678 {
1679   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1680   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1681
1682   /* Only fully-masked loops can have iteration counts less than the
1683      vectorization factor.  */
1684   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1685     {
1686       HOST_WIDE_INT max_niter;
1687
1688       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1689         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1690       else
1691         max_niter = max_stmt_executions_int (loop);
1692
1693       if (max_niter != -1
1694           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1695         {
1696           if (dump_enabled_p ())
1697             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1698                              "not vectorized: iteration count smaller than "
1699                              "vectorization factor.\n");
1700           return 0;
1701         }
1702     }
1703
1704   int min_profitable_iters, min_profitable_estimate;
1705   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1706                                       &min_profitable_estimate);
1707
1708   if (min_profitable_iters < 0)
1709     {
1710       if (dump_enabled_p ())
1711         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1712                          "not vectorized: vectorization not profitable.\n");
1713       if (dump_enabled_p ())
1714         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1715                          "not vectorized: vector version will never be "
1716                          "profitable.\n");
1717       return -1;
1718     }
1719
1720   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1721                                * assumed_vf);
1722
1723   /* Use the cost model only if it is more conservative than user specified
1724      threshold.  */
1725   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1726                                     min_profitable_iters);
1727
1728   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1729
1730   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1731       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1732     {
1733       if (dump_enabled_p ())
1734         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1735                          "not vectorized: vectorization not profitable.\n");
1736       if (dump_enabled_p ())
1737         dump_printf_loc (MSG_NOTE, vect_location,
1738                          "not vectorized: iteration count smaller than user "
1739                          "specified loop bound parameter or minimum profitable "
1740                          "iterations (whichever is more conservative).\n");
1741       return 0;
1742     }
1743
1744   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1745   if (estimated_niter == -1)
1746     estimated_niter = likely_max_stmt_executions_int (loop);
1747   if (estimated_niter != -1
1748       && ((unsigned HOST_WIDE_INT) estimated_niter
1749           < MAX (th, (unsigned) min_profitable_estimate)))
1750     {
1751       if (dump_enabled_p ())
1752         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753                          "not vectorized: estimated iteration count too "
1754                          "small.\n");
1755       if (dump_enabled_p ())
1756         dump_printf_loc (MSG_NOTE, vect_location,
1757                          "not vectorized: estimated iteration count smaller "
1758                          "than specified loop bound parameter or minimum "
1759                          "profitable iterations (whichever is more "
1760                          "conservative).\n");
1761       return -1;
1762     }
1763
1764   return 1;
1765 }
1766
1767 static bool
1768 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1769                            vec<data_reference_p> *datarefs,
1770                            unsigned int *n_stmts)
1771 {
1772   *n_stmts = 0;
1773   for (unsigned i = 0; i < loop->num_nodes; i++)
1774     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1775          !gsi_end_p (gsi); gsi_next (&gsi))
1776       {
1777         gimple *stmt = gsi_stmt (gsi);
1778         if (is_gimple_debug (stmt))
1779           continue;
1780         ++(*n_stmts);
1781         if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1782           {
1783             if (is_gimple_call (stmt) && loop->safelen)
1784               {
1785                 tree fndecl = gimple_call_fndecl (stmt), op;
1786                 if (fndecl != NULL_TREE)
1787                   {
1788                     cgraph_node *node = cgraph_node::get (fndecl);
1789                     if (node != NULL && node->simd_clones != NULL)
1790                       {
1791                         unsigned int j, n = gimple_call_num_args (stmt);
1792                         for (j = 0; j < n; j++)
1793                           {
1794                             op = gimple_call_arg (stmt, j);
1795                             if (DECL_P (op)
1796                                 || (REFERENCE_CLASS_P (op)
1797                                     && get_base_address (op)))
1798                               break;
1799                           }
1800                         op = gimple_call_lhs (stmt);
1801                         /* Ignore #pragma omp declare simd functions
1802                            if they don't have data references in the
1803                            call stmt itself.  */
1804                         if (j == n
1805                             && !(op
1806                                  && (DECL_P (op)
1807                                      || (REFERENCE_CLASS_P (op)
1808                                          && get_base_address (op)))))
1809                           continue;
1810                       }
1811                   }
1812               }
1813             return false;
1814           }
1815         /* If dependence analysis will give up due to the limit on the
1816            number of datarefs stop here and fail fatally.  */
1817         if (datarefs->length ()
1818             > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1819           return false;
1820       }
1821   return true;
1822 }
1823
1824 /* Function vect_analyze_loop_2.
1825
1826    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1827    for it.  The different analyses will record information in the
1828    loop_vec_info struct.  */
1829 static bool
1830 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1831 {
1832   bool ok;
1833   int res;
1834   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1835   poly_uint64 min_vf = 2;
1836
1837   /* The first group of checks is independent of the vector size.  */
1838   fatal = true;
1839
1840   /* Find all data references in the loop (which correspond to vdefs/vuses)
1841      and analyze their evolution in the loop.  */
1842
1843   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1844
1845   /* Gather the data references and count stmts in the loop.  */
1846   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1847     {
1848       if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1849                                       &LOOP_VINFO_DATAREFS (loop_vinfo),
1850                                       n_stmts))
1851         {
1852           if (dump_enabled_p ())
1853             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854                              "not vectorized: loop contains function "
1855                              "calls or data references that cannot "
1856                              "be analyzed\n");
1857           return false;
1858         }
1859       loop_vinfo->shared->save_datarefs ();
1860     }
1861   else
1862     loop_vinfo->shared->check_datarefs ();
1863
1864   /* Analyze the data references and also adjust the minimal
1865      vectorization factor according to the loads and stores.  */
1866
1867   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1868   if (!ok)
1869     {
1870       if (dump_enabled_p ())
1871         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872                          "bad data references.\n");
1873       return false;
1874     }
1875
1876   /* Classify all cross-iteration scalar data-flow cycles.
1877      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1878   vect_analyze_scalar_cycles (loop_vinfo);
1879
1880   vect_pattern_recog (loop_vinfo);
1881
1882   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1883
1884   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1885      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1886
1887   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1888   if (!ok)
1889     {
1890       if (dump_enabled_p ())
1891         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892                          "bad data access.\n");
1893       return false;
1894     }
1895
1896   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1897
1898   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1899   if (!ok)
1900     {
1901       if (dump_enabled_p ())
1902         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1903                          "unexpected pattern.\n");
1904       return false;
1905     }
1906
1907   /* While the rest of the analysis below depends on it in some way.  */
1908   fatal = false;
1909
1910   /* Analyze data dependences between the data-refs in the loop
1911      and adjust the maximum vectorization factor according to
1912      the dependences.
1913      FORNOW: fail at the first data dependence that we encounter.  */
1914
1915   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1916   if (!ok
1917       || (max_vf != MAX_VECTORIZATION_FACTOR
1918           && maybe_lt (max_vf, min_vf)))
1919     {
1920       if (dump_enabled_p ())
1921             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1922                              "bad data dependence.\n");
1923       return false;
1924     }
1925   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1926
1927   ok = vect_determine_vectorization_factor (loop_vinfo);
1928   if (!ok)
1929     {
1930       if (dump_enabled_p ())
1931         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1932                          "can't determine vectorization factor.\n");
1933       return false;
1934     }
1935   if (max_vf != MAX_VECTORIZATION_FACTOR
1936       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1937     {
1938       if (dump_enabled_p ())
1939         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1940                          "bad data dependence.\n");
1941       return false;
1942     }
1943
1944   /* Compute the scalar iteration cost.  */
1945   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1946
1947   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1948   unsigned th;
1949
1950   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1951   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1952   if (!ok)
1953     return false;
1954
1955   /* If there are any SLP instances mark them as pure_slp.  */
1956   bool slp = vect_make_slp_decision (loop_vinfo);
1957   if (slp)
1958     {
1959       /* Find stmts that need to be both vectorized and SLPed.  */
1960       vect_detect_hybrid_slp (loop_vinfo);
1961
1962       /* Update the vectorization factor based on the SLP decision.  */
1963       vect_update_vf_for_slp (loop_vinfo);
1964     }
1965
1966   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1967
1968   /* We don't expect to have to roll back to anything other than an empty
1969      set of rgroups.  */
1970   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1971
1972   /* This is the point where we can re-start analysis with SLP forced off.  */
1973 start_over:
1974
1975   /* Now the vectorization factor is final.  */
1976   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1977   gcc_assert (known_ne (vectorization_factor, 0U));
1978
1979   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1980     {
1981       dump_printf_loc (MSG_NOTE, vect_location,
1982                        "vectorization_factor = ");
1983       dump_dec (MSG_NOTE, vectorization_factor);
1984       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1985                    LOOP_VINFO_INT_NITERS (loop_vinfo));
1986     }
1987
1988   HOST_WIDE_INT max_niter
1989     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1990
1991   /* Analyze the alignment of the data-refs in the loop.
1992      Fail if a data reference is found that cannot be vectorized.  */
1993
1994   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1995   if (!ok)
1996     {
1997       if (dump_enabled_p ())
1998         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1999                          "bad data alignment.\n");
2000       return false;
2001     }
2002
2003   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2004      It is important to call pruning after vect_analyze_data_ref_accesses,
2005      since we use grouping information gathered by interleaving analysis.  */
2006   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2007   if (!ok)
2008     return false;
2009
2010   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2011      vectorization.  */
2012   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2013     {
2014     /* This pass will decide on using loop versioning and/or loop peeling in
2015        order to enhance the alignment of data references in the loop.  */
2016     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2017     if (!ok)
2018       {
2019         if (dump_enabled_p ())
2020           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2021                            "bad data alignment.\n");
2022         return false;
2023       }
2024     }
2025
2026   if (slp)
2027     {
2028       /* Analyze operations in the SLP instances.  Note this may
2029          remove unsupported SLP instances which makes the above
2030          SLP kind detection invalid.  */
2031       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2032       vect_slp_analyze_operations (loop_vinfo);
2033       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2034         goto again;
2035     }
2036
2037   /* Scan all the remaining operations in the loop that are not subject
2038      to SLP and make sure they are vectorizable.  */
2039   ok = vect_analyze_loop_operations (loop_vinfo);
2040   if (!ok)
2041     {
2042       if (dump_enabled_p ())
2043         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2044                          "bad operation or unsupported loop bound.\n");
2045       return false;
2046     }
2047
2048   /* Decide whether to use a fully-masked loop for this vectorization
2049      factor.  */
2050   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2051     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2052        && vect_verify_full_masking (loop_vinfo));
2053   if (dump_enabled_p ())
2054     {
2055       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2056         dump_printf_loc (MSG_NOTE, vect_location,
2057                          "using a fully-masked loop.\n");
2058       else
2059         dump_printf_loc (MSG_NOTE, vect_location,
2060                          "not using a fully-masked loop.\n");
2061     }
2062
2063   /* If epilog loop is required because of data accesses with gaps,
2064      one additional iteration needs to be peeled.  Check if there is
2065      enough iterations for vectorization.  */
2066   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2067       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2068       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2069     {
2070       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2071       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2072
2073       if (known_lt (wi::to_widest (scalar_niters), vf))
2074         {
2075           if (dump_enabled_p ())
2076             dump_printf_loc (MSG_NOTE, vect_location,
2077                              "loop has no enough iterations to support"
2078                              " peeling for gaps.\n");
2079           return false;
2080         }
2081     }
2082
2083   /* Check the costings of the loop make vectorizing worthwhile.  */
2084   res = vect_analyze_loop_costing (loop_vinfo);
2085   if (res < 0)
2086     goto again;
2087   if (!res)
2088     {
2089       if (dump_enabled_p ())
2090         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2091                          "Loop costings not worthwhile.\n");
2092       return false;
2093     }
2094
2095   /* Decide whether we need to create an epilogue loop to handle
2096      remaining scalar iterations.  */
2097   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2098
2099   unsigned HOST_WIDE_INT const_vf;
2100   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2101     /* The main loop handles all iterations.  */
2102     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2103   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2104            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2105     {
2106       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2107                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2108                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2109         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2110     }
2111   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2112            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2113            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2114                 < (unsigned) exact_log2 (const_vf))
2115                /* In case of versioning, check if the maximum number of
2116                   iterations is greater than th.  If they are identical,
2117                   the epilogue is unnecessary.  */
2118                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2119                    || ((unsigned HOST_WIDE_INT) max_niter
2120                        > (th / const_vf) * const_vf))))
2121     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2122
2123   /* If an epilogue loop is required make sure we can create one.  */
2124   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2125       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2126     {
2127       if (dump_enabled_p ())
2128         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2129       if (!vect_can_advance_ivs_p (loop_vinfo)
2130           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2131                                            single_exit (LOOP_VINFO_LOOP
2132                                                          (loop_vinfo))))
2133         {
2134           if (dump_enabled_p ())
2135             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136                              "not vectorized: can't create required "
2137                              "epilog loop\n");
2138           goto again;
2139         }
2140     }
2141
2142   /* During peeling, we need to check if number of loop iterations is
2143      enough for both peeled prolog loop and vector loop.  This check
2144      can be merged along with threshold check of loop versioning, so
2145      increase threshold for this case if necessary.  */
2146   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2147     {
2148       poly_uint64 niters_th = 0;
2149
2150       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2151         {
2152           /* Niters for peeled prolog loop.  */
2153           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2154             {
2155               struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2156               tree vectype
2157                 = STMT_VINFO_VECTYPE (vinfo_for_stmt (vect_dr_stmt (dr)));
2158               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2159             }
2160           else
2161             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2162         }
2163
2164       /* Niters for at least one iteration of vectorized loop.  */
2165       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2166         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2167       /* One additional iteration because of peeling for gap.  */
2168       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2169         niters_th += 1;
2170       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2171     }
2172
2173   gcc_assert (known_eq (vectorization_factor,
2174                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2175
2176   /* Ok to vectorize!  */
2177   return true;
2178
2179 again:
2180   /* Try again with SLP forced off but if we didn't do any SLP there is
2181      no point in re-trying.  */
2182   if (!slp)
2183     return false;
2184
2185   /* If there are reduction chains re-trying will fail anyway.  */
2186   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2187     return false;
2188
2189   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2190      via interleaving or lane instructions.  */
2191   slp_instance instance;
2192   slp_tree node;
2193   unsigned i, j;
2194   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2195     {
2196       stmt_vec_info vinfo;
2197       vinfo = vinfo_for_stmt
2198           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2199       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2200         continue;
2201       vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2202       unsigned int size = DR_GROUP_SIZE (vinfo);
2203       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2204       if (! vect_store_lanes_supported (vectype, size, false)
2205          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2206          && ! vect_grouped_store_supported (vectype, size))
2207        return false;
2208       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2209         {
2210           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2211           vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2212           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2213           size = DR_GROUP_SIZE (vinfo);
2214           vectype = STMT_VINFO_VECTYPE (vinfo);
2215           if (! vect_load_lanes_supported (vectype, size, false)
2216               && ! vect_grouped_load_supported (vectype, single_element_p,
2217                                                 size))
2218             return false;
2219         }
2220     }
2221
2222   if (dump_enabled_p ())
2223     dump_printf_loc (MSG_NOTE, vect_location,
2224                      "re-trying with SLP disabled\n");
2225
2226   /* Roll back state appropriately.  No SLP this time.  */
2227   slp = false;
2228   /* Restore vectorization factor as it were without SLP.  */
2229   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2230   /* Free the SLP instances.  */
2231   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2232     vect_free_slp_instance (instance, false);
2233   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2234   /* Reset SLP type to loop_vect on all stmts.  */
2235   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2236     {
2237       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2238       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2239            !gsi_end_p (si); gsi_next (&si))
2240         {
2241           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2242           STMT_SLP_TYPE (stmt_info) = loop_vect;
2243         }
2244       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2245            !gsi_end_p (si); gsi_next (&si))
2246         {
2247           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2248           STMT_SLP_TYPE (stmt_info) = loop_vect;
2249           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2250             {
2251               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2252               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2253               STMT_SLP_TYPE (stmt_info) = loop_vect;
2254               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2255                    !gsi_end_p (pi); gsi_next (&pi))
2256                 {
2257                   gimple *pstmt = gsi_stmt (pi);
2258                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2259                 }
2260             }
2261         }
2262     }
2263   /* Free optimized alias test DDRS.  */
2264   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2265   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2266   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2267   /* Reset target cost data.  */
2268   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2269   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2270     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2271   /* Reset accumulated rgroup information.  */
2272   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2273   /* Reset assorted flags.  */
2274   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2275   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2276   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2277   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2278   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2279
2280   goto start_over;
2281 }
2282
2283 /* Function vect_analyze_loop.
2284
2285    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2286    for it.  The different analyses will record information in the
2287    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2288    be vectorized.  */
2289 loop_vec_info
2290 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2291                    vec_info_shared *shared)
2292 {
2293   loop_vec_info loop_vinfo;
2294   auto_vector_sizes vector_sizes;
2295
2296   /* Autodetect first vector size we try.  */
2297   current_vector_size = 0;
2298   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2299   unsigned int next_size = 0;
2300
2301   DUMP_VECT_SCOPE ("analyze_loop_nest");
2302
2303   if (loop_outer (loop)
2304       && loop_vec_info_for_loop (loop_outer (loop))
2305       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2306     {
2307       if (dump_enabled_p ())
2308         dump_printf_loc (MSG_NOTE, vect_location,
2309                          "outer-loop already vectorized.\n");
2310       return NULL;
2311     }
2312
2313   if (!find_loop_nest (loop, &shared->loop_nest))
2314     {
2315       if (dump_enabled_p ())
2316         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317                          "not vectorized: loop nest containing two "
2318                          "or more consecutive inner loops cannot be "
2319                          "vectorized\n");
2320       return NULL;
2321     }
2322
2323   unsigned n_stmts = 0;
2324   poly_uint64 autodetected_vector_size = 0;
2325   while (1)
2326     {
2327       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2328       loop_vinfo = vect_analyze_loop_form (loop, shared);
2329       if (!loop_vinfo)
2330         {
2331           if (dump_enabled_p ())
2332             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333                              "bad loop form.\n");
2334           return NULL;
2335         }
2336
2337       bool fatal = false;
2338
2339       if (orig_loop_vinfo)
2340         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2341
2342       if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2343         {
2344           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2345
2346           return loop_vinfo;
2347         }
2348
2349       delete loop_vinfo;
2350
2351       if (next_size == 0)
2352         autodetected_vector_size = current_vector_size;
2353
2354       if (next_size < vector_sizes.length ()
2355           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2356         next_size += 1;
2357
2358       if (fatal
2359           || next_size == vector_sizes.length ()
2360           || known_eq (current_vector_size, 0U))
2361         return NULL;
2362
2363       /* Try the next biggest vector size.  */
2364       current_vector_size = vector_sizes[next_size++];
2365       if (dump_enabled_p ())
2366         {
2367           dump_printf_loc (MSG_NOTE, vect_location,
2368                            "***** Re-trying analysis with "
2369                            "vector size ");
2370           dump_dec (MSG_NOTE, current_vector_size);
2371           dump_printf (MSG_NOTE, "\n");
2372         }
2373     }
2374 }
2375
2376 /* Return true if there is an in-order reduction function for CODE, storing
2377    it in *REDUC_FN if so.  */
2378
2379 static bool
2380 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2381 {
2382   switch (code)
2383     {
2384     case PLUS_EXPR:
2385       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2386       return true;
2387
2388     default:
2389       return false;
2390     }
2391 }
2392
2393 /* Function reduction_fn_for_scalar_code
2394
2395    Input:
2396    CODE - tree_code of a reduction operations.
2397
2398    Output:
2399    REDUC_FN - the corresponding internal function to be used to reduce the
2400       vector of partial results into a single scalar result, or IFN_LAST
2401       if the operation is a supported reduction operation, but does not have
2402       such an internal function.
2403
2404    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2405
2406 static bool
2407 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2408 {
2409   switch (code)
2410     {
2411       case MAX_EXPR:
2412         *reduc_fn = IFN_REDUC_MAX;
2413         return true;
2414
2415       case MIN_EXPR:
2416         *reduc_fn = IFN_REDUC_MIN;
2417         return true;
2418
2419       case PLUS_EXPR:
2420         *reduc_fn = IFN_REDUC_PLUS;
2421         return true;
2422
2423       case BIT_AND_EXPR:
2424         *reduc_fn = IFN_REDUC_AND;
2425         return true;
2426
2427       case BIT_IOR_EXPR:
2428         *reduc_fn = IFN_REDUC_IOR;
2429         return true;
2430
2431       case BIT_XOR_EXPR:
2432         *reduc_fn = IFN_REDUC_XOR;
2433         return true;
2434
2435       case MULT_EXPR:
2436       case MINUS_EXPR:
2437         *reduc_fn = IFN_LAST;
2438         return true;
2439
2440       default:
2441        return false;
2442     }
2443 }
2444
2445 /* If there is a neutral value X such that SLP reduction NODE would not
2446    be affected by the introduction of additional X elements, return that X,
2447    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2448    is true if the SLP statements perform a single reduction, false if each
2449    statement performs an independent reduction.  */
2450
2451 static tree
2452 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2453                               bool reduc_chain)
2454 {
2455   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2456   gimple *stmt = stmts[0];
2457   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2458   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2459   tree scalar_type = TREE_TYPE (vector_type);
2460   struct loop *loop = gimple_bb (stmt)->loop_father;
2461   gcc_assert (loop);
2462
2463   switch (code)
2464     {
2465     case WIDEN_SUM_EXPR:
2466     case DOT_PROD_EXPR:
2467     case SAD_EXPR:
2468     case PLUS_EXPR:
2469     case MINUS_EXPR:
2470     case BIT_IOR_EXPR:
2471     case BIT_XOR_EXPR:
2472       return build_zero_cst (scalar_type);
2473
2474     case MULT_EXPR:
2475       return build_one_cst (scalar_type);
2476
2477     case BIT_AND_EXPR:
2478       return build_all_ones_cst (scalar_type);
2479
2480     case MAX_EXPR:
2481     case MIN_EXPR:
2482       /* For MIN/MAX the initial values are neutral.  A reduction chain
2483          has only a single initial value, so that value is neutral for
2484          all statements.  */
2485       if (reduc_chain)
2486         return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2487       return NULL_TREE;
2488
2489     default:
2490       return NULL_TREE;
2491     }
2492 }
2493
2494 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2495    STMT is printed with a message MSG. */
2496
2497 static void
2498 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2499 {
2500   dump_printf_loc (msg_type, vect_location, "%s", msg);
2501   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2502 }
2503
2504
2505 /* Detect SLP reduction of the form:
2506
2507    #a1 = phi <a5, a0>
2508    a2 = operation (a1)
2509    a3 = operation (a2)
2510    a4 = operation (a3)
2511    a5 = operation (a4)
2512
2513    #a = phi <a5>
2514
2515    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2516    FIRST_STMT is the first reduction stmt in the chain
2517    (a2 = operation (a1)).
2518
2519    Return TRUE if a reduction chain was detected.  */
2520
2521 static bool
2522 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2523                        gimple *first_stmt)
2524 {
2525   struct loop *loop = (gimple_bb (phi))->loop_father;
2526   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2527   enum tree_code code;
2528   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2529   stmt_vec_info use_stmt_info, current_stmt_info;
2530   tree lhs;
2531   imm_use_iterator imm_iter;
2532   use_operand_p use_p;
2533   int nloop_uses, size = 0, n_out_of_loop_uses;
2534   bool found = false;
2535
2536   if (loop != vect_loop)
2537     return false;
2538
2539   lhs = PHI_RESULT (phi);
2540   code = gimple_assign_rhs_code (first_stmt);
2541   while (1)
2542     {
2543       nloop_uses = 0;
2544       n_out_of_loop_uses = 0;
2545       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2546         {
2547           gimple *use_stmt = USE_STMT (use_p);
2548           if (is_gimple_debug (use_stmt))
2549             continue;
2550
2551           /* Check if we got back to the reduction phi.  */
2552           if (use_stmt == phi)
2553             {
2554               loop_use_stmt = use_stmt;
2555               found = true;
2556               break;
2557             }
2558
2559           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2560             {
2561               loop_use_stmt = use_stmt;
2562               nloop_uses++;
2563             }
2564            else
2565              n_out_of_loop_uses++;
2566
2567            /* There are can be either a single use in the loop or two uses in
2568               phi nodes.  */
2569            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2570              return false;
2571         }
2572
2573       if (found)
2574         break;
2575
2576       /* We reached a statement with no loop uses.  */
2577       if (nloop_uses == 0)
2578         return false;
2579
2580       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2581       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2582         return false;
2583
2584       if (!is_gimple_assign (loop_use_stmt)
2585           || code != gimple_assign_rhs_code (loop_use_stmt)
2586           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2587         return false;
2588
2589       /* Insert USE_STMT into reduction chain.  */
2590       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2591       if (current_stmt)
2592         {
2593           current_stmt_info = vinfo_for_stmt (current_stmt);
2594           REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2595           REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2596             = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2597         }
2598       else
2599         REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2600
2601       lhs = gimple_assign_lhs (loop_use_stmt);
2602       current_stmt = loop_use_stmt;
2603       size++;
2604    }
2605
2606   if (!found || loop_use_stmt != phi || size < 2)
2607     return false;
2608
2609   /* Swap the operands, if needed, to make the reduction operand be the second
2610      operand.  */
2611   lhs = PHI_RESULT (phi);
2612   next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2613   while (next_stmt)
2614     {
2615       if (gimple_assign_rhs2 (next_stmt) == lhs)
2616         {
2617           tree op = gimple_assign_rhs1 (next_stmt);
2618           gimple *def_stmt = NULL;
2619
2620           if (TREE_CODE (op) == SSA_NAME)
2621             def_stmt = SSA_NAME_DEF_STMT (op);
2622
2623           /* Check that the other def is either defined in the loop
2624              ("vect_internal_def"), or it's an induction (defined by a
2625              loop-header phi-node).  */
2626           if (def_stmt
2627               && gimple_bb (def_stmt)
2628               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2629               && (is_gimple_assign (def_stmt)
2630                   || is_gimple_call (def_stmt)
2631                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2632                            == vect_induction_def
2633                   || (gimple_code (def_stmt) == GIMPLE_PHI
2634                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2635                                   == vect_internal_def
2636                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2637             {
2638               lhs = gimple_assign_lhs (next_stmt);
2639               next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2640               continue;
2641             }
2642
2643           return false;
2644         }
2645       else
2646         {
2647           tree op = gimple_assign_rhs2 (next_stmt);
2648           gimple *def_stmt = NULL;
2649
2650           if (TREE_CODE (op) == SSA_NAME)
2651             def_stmt = SSA_NAME_DEF_STMT (op);
2652
2653           /* Check that the other def is either defined in the loop
2654             ("vect_internal_def"), or it's an induction (defined by a
2655             loop-header phi-node).  */
2656           if (def_stmt
2657               && gimple_bb (def_stmt)
2658               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2659               && (is_gimple_assign (def_stmt)
2660                   || is_gimple_call (def_stmt)
2661                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2662                               == vect_induction_def
2663                   || (gimple_code (def_stmt) == GIMPLE_PHI
2664                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2665                                   == vect_internal_def
2666                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2667             {
2668               if (dump_enabled_p ())
2669                 {
2670                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2671                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2672                 }
2673
2674               swap_ssa_operands (next_stmt,
2675                                  gimple_assign_rhs1_ptr (next_stmt),
2676                                  gimple_assign_rhs2_ptr (next_stmt));
2677               update_stmt (next_stmt);
2678
2679               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2680                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2681             }
2682           else
2683             return false;
2684         }
2685
2686       lhs = gimple_assign_lhs (next_stmt);
2687       next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2688     }
2689
2690   /* Save the chain for further analysis in SLP detection.  */
2691   first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2692   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2693   REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2694
2695   return true;
2696 }
2697
2698 /* Return true if we need an in-order reduction for operation CODE
2699    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2700    overflow must wrap.  */
2701
2702 static bool
2703 needs_fold_left_reduction_p (tree type, tree_code code,
2704                              bool need_wrapping_integral_overflow)
2705 {
2706   /* CHECKME: check for !flag_finite_math_only too?  */
2707   if (SCALAR_FLOAT_TYPE_P (type))
2708     switch (code)
2709       {
2710       case MIN_EXPR:
2711       case MAX_EXPR:
2712         return false;
2713
2714       default:
2715         return !flag_associative_math;
2716       }
2717
2718   if (INTEGRAL_TYPE_P (type))
2719     {
2720       if (!operation_no_trapping_overflow (type, code))
2721         return true;
2722       if (need_wrapping_integral_overflow
2723           && !TYPE_OVERFLOW_WRAPS (type)
2724           && operation_can_overflow (code))
2725         return true;
2726       return false;
2727     }
2728
2729   if (SAT_FIXED_POINT_TYPE_P (type))
2730     return true;
2731
2732   return false;
2733 }
2734
2735 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2736    reduction operation CODE has a handled computation expression.  */
2737
2738 bool
2739 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2740                       tree loop_arg, enum tree_code code)
2741 {
2742   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2743   auto_bitmap visited;
2744   tree lookfor = PHI_RESULT (phi);
2745   ssa_op_iter curri;
2746   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2747   while (USE_FROM_PTR (curr) != loop_arg)
2748     curr = op_iter_next_use (&curri);
2749   curri.i = curri.numops;
2750   do
2751     {
2752       path.safe_push (std::make_pair (curri, curr));
2753       tree use = USE_FROM_PTR (curr);
2754       if (use == lookfor)
2755         break;
2756       gimple *def = SSA_NAME_DEF_STMT (use);
2757       if (gimple_nop_p (def)
2758           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2759         {
2760 pop:
2761           do
2762             {
2763               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2764               curri = x.first;
2765               curr = x.second;
2766               do
2767                 curr = op_iter_next_use (&curri);
2768               /* Skip already visited or non-SSA operands (from iterating
2769                  over PHI args).  */
2770               while (curr != NULL_USE_OPERAND_P
2771                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2772                          || ! bitmap_set_bit (visited,
2773                                               SSA_NAME_VERSION
2774                                                 (USE_FROM_PTR (curr)))));
2775             }
2776           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2777           if (curr == NULL_USE_OPERAND_P)
2778             break;
2779         }
2780       else
2781         {
2782           if (gimple_code (def) == GIMPLE_PHI)
2783             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2784           else
2785             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2786           while (curr != NULL_USE_OPERAND_P
2787                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2788                      || ! bitmap_set_bit (visited,
2789                                           SSA_NAME_VERSION
2790                                             (USE_FROM_PTR (curr)))))
2791             curr = op_iter_next_use (&curri);
2792           if (curr == NULL_USE_OPERAND_P)
2793             goto pop;
2794         }
2795     }
2796   while (1);
2797   if (dump_file && (dump_flags & TDF_DETAILS))
2798     {
2799       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2800       unsigned i;
2801       std::pair<ssa_op_iter, use_operand_p> *x;
2802       FOR_EACH_VEC_ELT (path, i, x)
2803         {
2804           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2805           dump_printf (MSG_NOTE, " ");
2806         }
2807       dump_printf (MSG_NOTE, "\n");
2808     }
2809
2810   /* Check whether the reduction path detected is valid.  */
2811   bool fail = path.length () == 0;
2812   bool neg = false;
2813   for (unsigned i = 1; i < path.length (); ++i)
2814     {
2815       gimple *use_stmt = USE_STMT (path[i].second);
2816       tree op = USE_FROM_PTR (path[i].second);
2817       if (! has_single_use (op)
2818           || ! is_gimple_assign (use_stmt))
2819         {
2820           fail = true;
2821           break;
2822         }
2823       if (gimple_assign_rhs_code (use_stmt) != code)
2824         {
2825           if (code == PLUS_EXPR
2826               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2827             {
2828               /* Track whether we negate the reduction value each iteration.  */
2829               if (gimple_assign_rhs2 (use_stmt) == op)
2830                 neg = ! neg;
2831             }
2832           else
2833             {
2834               fail = true;
2835               break;
2836             }
2837         }
2838     }
2839   return ! fail && ! neg;
2840 }
2841
2842
2843 /* Function vect_is_simple_reduction
2844
2845    (1) Detect a cross-iteration def-use cycle that represents a simple
2846    reduction computation.  We look for the following pattern:
2847
2848    loop_header:
2849      a1 = phi < a0, a2 >
2850      a3 = ...
2851      a2 = operation (a3, a1)
2852
2853    or
2854
2855    a3 = ...
2856    loop_header:
2857      a1 = phi < a0, a2 >
2858      a2 = operation (a3, a1)
2859
2860    such that:
2861    1. operation is commutative and associative and it is safe to
2862       change the order of the computation
2863    2. no uses for a2 in the loop (a2 is used out of the loop)
2864    3. no uses of a1 in the loop besides the reduction operation
2865    4. no uses of a1 outside the loop.
2866
2867    Conditions 1,4 are tested here.
2868    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2869
2870    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2871    nested cycles.
2872
2873    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2874    reductions:
2875
2876      a1 = phi < a0, a2 >
2877      inner loop (def of a3)
2878      a2 = phi < a3 >
2879
2880    (4) Detect condition expressions, ie:
2881      for (int i = 0; i < N; i++)
2882        if (a[i] < val)
2883         ret_val = a[i];
2884
2885 */
2886
2887 static gimple *
2888 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2889                           bool *double_reduc,
2890                           bool need_wrapping_integral_overflow,
2891                           enum vect_reduction_type *v_reduc_type)
2892 {
2893   struct loop *loop = (gimple_bb (phi))->loop_father;
2894   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2895   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2896   enum tree_code orig_code, code;
2897   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2898   tree type;
2899   int nloop_uses;
2900   tree name;
2901   imm_use_iterator imm_iter;
2902   use_operand_p use_p;
2903   bool phi_def;
2904
2905   *double_reduc = false;
2906   *v_reduc_type = TREE_CODE_REDUCTION;
2907
2908   tree phi_name = PHI_RESULT (phi);
2909   /* ???  If there are no uses of the PHI result the inner loop reduction
2910      won't be detected as possibly double-reduction by vectorizable_reduction
2911      because that tries to walk the PHI arg from the preheader edge which
2912      can be constant.  See PR60382.  */
2913   if (has_zero_uses (phi_name))
2914     return NULL;
2915   nloop_uses = 0;
2916   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2917     {
2918       gimple *use_stmt = USE_STMT (use_p);
2919       if (is_gimple_debug (use_stmt))
2920         continue;
2921
2922       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2923         {
2924           if (dump_enabled_p ())
2925             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2926                              "intermediate value used outside loop.\n");
2927
2928           return NULL;
2929         }
2930
2931       nloop_uses++;
2932       if (nloop_uses > 1)
2933         {
2934           if (dump_enabled_p ())
2935             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2936                              "reduction value used in loop.\n");
2937           return NULL;
2938         }
2939
2940       phi_use_stmt = use_stmt;
2941     }
2942
2943   edge latch_e = loop_latch_edge (loop);
2944   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2945   if (TREE_CODE (loop_arg) != SSA_NAME)
2946     {
2947       if (dump_enabled_p ())
2948         {
2949           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2950                            "reduction: not ssa_name: ");
2951           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2952           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2953         }
2954       return NULL;
2955     }
2956
2957   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2958   if (is_gimple_assign (def_stmt))
2959     {
2960       name = gimple_assign_lhs (def_stmt);
2961       phi_def = false;
2962     }
2963   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2964     {
2965       name = PHI_RESULT (def_stmt);
2966       phi_def = true;
2967     }
2968   else
2969     {
2970       if (dump_enabled_p ())
2971         {
2972           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2973                            "reduction: unhandled reduction operation: ");
2974           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2975         }
2976       return NULL;
2977     }
2978
2979   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2980     return NULL;
2981
2982   nloop_uses = 0;
2983   auto_vec<gphi *, 3> lcphis;
2984   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2985     {
2986       gimple *use_stmt = USE_STMT (use_p);
2987       if (is_gimple_debug (use_stmt))
2988         continue;
2989       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2990         nloop_uses++;
2991       else
2992         /* We can have more than one loop-closed PHI.  */
2993         lcphis.safe_push (as_a <gphi *> (use_stmt));
2994       if (nloop_uses > 1)
2995         {
2996           if (dump_enabled_p ())
2997             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2998                              "reduction used in loop.\n");
2999           return NULL;
3000         }
3001     }
3002
3003   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3004      defined in the inner loop.  */
3005   if (phi_def)
3006     {
3007       op1 = PHI_ARG_DEF (def_stmt, 0);
3008
3009       if (gimple_phi_num_args (def_stmt) != 1
3010           || TREE_CODE (op1) != SSA_NAME)
3011         {
3012           if (dump_enabled_p ())
3013             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3014                              "unsupported phi node definition.\n");
3015
3016           return NULL;
3017         }
3018
3019       def1 = SSA_NAME_DEF_STMT (op1);
3020       if (gimple_bb (def1)
3021           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3022           && loop->inner
3023           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3024           && is_gimple_assign (def1)
3025           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3026         {
3027           if (dump_enabled_p ())
3028             report_vect_op (MSG_NOTE, def_stmt,
3029                             "detected double reduction: ");
3030
3031           *double_reduc = true;
3032           return def_stmt;
3033         }
3034
3035       return NULL;
3036     }
3037
3038   /* If we are vectorizing an inner reduction we are executing that
3039      in the original order only in case we are not dealing with a
3040      double reduction.  */
3041   bool check_reduction = true;
3042   if (flow_loop_nested_p (vect_loop, loop))
3043     {
3044       gphi *lcphi;
3045       unsigned i;
3046       check_reduction = false;
3047       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3048         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3049           {
3050             gimple *use_stmt = USE_STMT (use_p);
3051             if (is_gimple_debug (use_stmt))
3052               continue;
3053             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3054               check_reduction = true;
3055           }
3056     }
3057
3058   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3059   code = orig_code = gimple_assign_rhs_code (def_stmt);
3060
3061   /* We can handle "res -= x[i]", which is non-associative by
3062      simply rewriting this into "res += -x[i]".  Avoid changing
3063      gimple instruction for the first simple tests and only do this
3064      if we're allowed to change code at all.  */
3065   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3066     code = PLUS_EXPR;
3067
3068   if (code == COND_EXPR)
3069     {
3070       if (! nested_in_vect_loop)
3071         *v_reduc_type = COND_REDUCTION;
3072
3073       op3 = gimple_assign_rhs1 (def_stmt);
3074       if (COMPARISON_CLASS_P (op3))
3075         {
3076           op4 = TREE_OPERAND (op3, 1);
3077           op3 = TREE_OPERAND (op3, 0);
3078         }
3079       if (op3 == phi_name || op4 == phi_name)
3080         {
3081           if (dump_enabled_p ())
3082             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3083                             "reduction: condition depends on previous"
3084                             " iteration: ");
3085           return NULL;
3086         }
3087
3088       op1 = gimple_assign_rhs2 (def_stmt);
3089       op2 = gimple_assign_rhs3 (def_stmt);
3090     }
3091   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3092     {
3093       if (dump_enabled_p ())
3094         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3095                         "reduction: not commutative/associative: ");
3096       return NULL;
3097     }
3098   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3099     {
3100       op1 = gimple_assign_rhs1 (def_stmt);
3101       op2 = gimple_assign_rhs2 (def_stmt);
3102     }
3103   else
3104     {
3105       if (dump_enabled_p ())
3106         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3107                         "reduction: not handled operation: ");
3108       return NULL;
3109     }
3110
3111   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3112     {
3113       if (dump_enabled_p ())
3114         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3115                         "reduction: both uses not ssa_names: ");
3116
3117       return NULL;
3118     }
3119
3120   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3121   if ((TREE_CODE (op1) == SSA_NAME
3122        && !types_compatible_p (type,TREE_TYPE (op1)))
3123       || (TREE_CODE (op2) == SSA_NAME
3124           && !types_compatible_p (type, TREE_TYPE (op2)))
3125       || (op3 && TREE_CODE (op3) == SSA_NAME
3126           && !types_compatible_p (type, TREE_TYPE (op3)))
3127       || (op4 && TREE_CODE (op4) == SSA_NAME
3128           && !types_compatible_p (type, TREE_TYPE (op4))))
3129     {
3130       if (dump_enabled_p ())
3131         {
3132           dump_printf_loc (MSG_NOTE, vect_location,
3133                            "reduction: multiple types: operation type: ");
3134           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3135           dump_printf (MSG_NOTE, ", operands types: ");
3136           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3137                              TREE_TYPE (op1));
3138           dump_printf (MSG_NOTE, ",");
3139           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3140                              TREE_TYPE (op2));
3141           if (op3)
3142             {
3143               dump_printf (MSG_NOTE, ",");
3144               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3145                                  TREE_TYPE (op3));
3146             }
3147
3148           if (op4)
3149             {
3150               dump_printf (MSG_NOTE, ",");
3151               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3152                                  TREE_TYPE (op4));
3153             }
3154           dump_printf (MSG_NOTE, "\n");
3155         }
3156
3157       return NULL;
3158     }
3159
3160   /* Check whether it's ok to change the order of the computation.
3161      Generally, when vectorizing a reduction we change the order of the
3162      computation.  This may change the behavior of the program in some
3163      cases, so we need to check that this is ok.  One exception is when
3164      vectorizing an outer-loop: the inner-loop is executed sequentially,
3165      and therefore vectorizing reductions in the inner-loop during
3166      outer-loop vectorization is safe.  */
3167   if (check_reduction
3168       && *v_reduc_type == TREE_CODE_REDUCTION
3169       && needs_fold_left_reduction_p (type, code,
3170                                       need_wrapping_integral_overflow))
3171     *v_reduc_type = FOLD_LEFT_REDUCTION;
3172
3173   /* Reduction is safe. We're dealing with one of the following:
3174      1) integer arithmetic and no trapv
3175      2) floating point arithmetic, and special flags permit this optimization
3176      3) nested cycle (i.e., outer loop vectorization).  */
3177   if (TREE_CODE (op1) == SSA_NAME)
3178     def1 = SSA_NAME_DEF_STMT (op1);
3179
3180   if (TREE_CODE (op2) == SSA_NAME)
3181     def2 = SSA_NAME_DEF_STMT (op2);
3182
3183   if (code != COND_EXPR
3184       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3185     {
3186       if (dump_enabled_p ())
3187         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3188       return NULL;
3189     }
3190
3191   /* Check that one def is the reduction def, defined by PHI,
3192      the other def is either defined in the loop ("vect_internal_def"),
3193      or it's an induction (defined by a loop-header phi-node).  */
3194
3195   if (def2 && def2 == phi
3196       && (code == COND_EXPR
3197           || !def1 || gimple_nop_p (def1)
3198           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3199           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3200               && (is_gimple_assign (def1)
3201                   || is_gimple_call (def1)
3202                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3203                       == vect_induction_def
3204                   || (gimple_code (def1) == GIMPLE_PHI
3205                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3206                           == vect_internal_def
3207                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3208     {
3209       if (dump_enabled_p ())
3210         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3211       return def_stmt;
3212     }
3213
3214   if (def1 && def1 == phi
3215       && (code == COND_EXPR
3216           || !def2 || gimple_nop_p (def2)
3217           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3218           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3219               && (is_gimple_assign (def2)
3220                   || is_gimple_call (def2)
3221                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3222                        == vect_induction_def
3223                   || (gimple_code (def2) == GIMPLE_PHI
3224                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3225                            == vect_internal_def
3226                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3227     {
3228       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3229         {
3230           /* Check if we can swap operands (just for simplicity - so that
3231              the rest of the code can assume that the reduction variable
3232              is always the last (second) argument).  */
3233           if (code == COND_EXPR)
3234             {
3235               /* Swap cond_expr by inverting the condition.  */
3236               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3237               enum tree_code invert_code = ERROR_MARK;
3238               enum tree_code cond_code = TREE_CODE (cond_expr);
3239
3240               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3241                 {
3242                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3243                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3244                 }
3245               if (invert_code != ERROR_MARK)
3246                 {
3247                   TREE_SET_CODE (cond_expr, invert_code);
3248                   swap_ssa_operands (def_stmt,
3249                                      gimple_assign_rhs2_ptr (def_stmt),
3250                                      gimple_assign_rhs3_ptr (def_stmt));
3251                 }
3252               else
3253                 {
3254                   if (dump_enabled_p ())
3255                     report_vect_op (MSG_NOTE, def_stmt,
3256                                     "detected reduction: cannot swap operands "
3257                                     "for cond_expr");
3258                   return NULL;
3259                 }
3260             }
3261           else
3262             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3263                                gimple_assign_rhs2_ptr (def_stmt));
3264
3265           if (dump_enabled_p ())
3266             report_vect_op (MSG_NOTE, def_stmt,
3267                             "detected reduction: need to swap operands: ");
3268
3269           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3270             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3271         }
3272       else
3273         {
3274           if (dump_enabled_p ())
3275             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3276         }
3277
3278       return def_stmt;
3279     }
3280
3281   /* Try to find SLP reduction chain.  */
3282   if (! nested_in_vect_loop
3283       && code != COND_EXPR
3284       && orig_code != MINUS_EXPR
3285       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3286     {
3287       if (dump_enabled_p ())
3288         report_vect_op (MSG_NOTE, def_stmt,
3289                         "reduction: detected reduction chain: ");
3290
3291       return def_stmt;
3292     }
3293
3294   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3295   gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3296   while (first)
3297     {
3298       gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3299       REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3300       REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3301       first = next;
3302     }
3303
3304   /* Look for the expression computing loop_arg from loop PHI result.  */
3305   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3306                             code))
3307     return def_stmt;
3308
3309   if (dump_enabled_p ())
3310     {
3311       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3312                       "reduction: unknown pattern: ");
3313     }
3314
3315   return NULL;
3316 }
3317
3318 /* Wrapper around vect_is_simple_reduction, which will modify code
3319    in-place if it enables detection of more reductions.  Arguments
3320    as there.  */
3321
3322 gimple *
3323 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3324                              bool *double_reduc,
3325                              bool need_wrapping_integral_overflow)
3326 {
3327   enum vect_reduction_type v_reduc_type;
3328   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3329                                           need_wrapping_integral_overflow,
3330                                           &v_reduc_type);
3331   if (def)
3332     {
3333       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3334       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3335       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3336       reduc_def_info = vinfo_for_stmt (def);
3337       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3338       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3339     }
3340   return def;
3341 }
3342
3343 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3344 int
3345 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3346                              int *peel_iters_epilogue,
3347                              stmt_vector_for_cost *scalar_cost_vec,
3348                              stmt_vector_for_cost *prologue_cost_vec,
3349                              stmt_vector_for_cost *epilogue_cost_vec)
3350 {
3351   int retval = 0;
3352   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3353
3354   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3355     {
3356       *peel_iters_epilogue = assumed_vf / 2;
3357       if (dump_enabled_p ())
3358         dump_printf_loc (MSG_NOTE, vect_location,
3359                          "cost model: epilogue peel iters set to vf/2 "
3360                          "because loop iterations are unknown .\n");
3361
3362       /* If peeled iterations are known but number of scalar loop
3363          iterations are unknown, count a taken branch per peeled loop.  */
3364       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3365                                  NULL, 0, vect_prologue);
3366       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3367                                  NULL, 0, vect_epilogue);
3368     }
3369   else
3370     {
3371       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3372       peel_iters_prologue = niters < peel_iters_prologue ?
3373                             niters : peel_iters_prologue;
3374       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3375       /* If we need to peel for gaps, but no peeling is required, we have to
3376          peel VF iterations.  */
3377       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3378         *peel_iters_epilogue = assumed_vf;
3379     }
3380
3381   stmt_info_for_cost *si;
3382   int j;
3383   if (peel_iters_prologue)
3384     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3385         {
3386           stmt_vec_info stmt_info
3387             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3388           retval += record_stmt_cost (prologue_cost_vec,
3389                                       si->count * peel_iters_prologue,
3390                                       si->kind, stmt_info, si->misalign,
3391                                       vect_prologue);
3392         }
3393   if (*peel_iters_epilogue)
3394     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3395         {
3396           stmt_vec_info stmt_info
3397             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3398           retval += record_stmt_cost (epilogue_cost_vec,
3399                                       si->count * *peel_iters_epilogue,
3400                                       si->kind, stmt_info, si->misalign,
3401                                       vect_epilogue);
3402         }
3403
3404   return retval;
3405 }
3406
3407 /* Function vect_estimate_min_profitable_iters
3408
3409    Return the number of iterations required for the vector version of the
3410    loop to be profitable relative to the cost of the scalar version of the
3411    loop.
3412
3413    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3414    of iterations for vectorization.  -1 value means loop vectorization
3415    is not profitable.  This returned value may be used for dynamic
3416    profitability check.
3417
3418    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3419    for static check against estimated number of iterations.  */
3420
3421 static void
3422 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3423                                     int *ret_min_profitable_niters,
3424                                     int *ret_min_profitable_estimate)
3425 {
3426   int min_profitable_iters;
3427   int min_profitable_estimate;
3428   int peel_iters_prologue;
3429   int peel_iters_epilogue;
3430   unsigned vec_inside_cost = 0;
3431   int vec_outside_cost = 0;
3432   unsigned vec_prologue_cost = 0;
3433   unsigned vec_epilogue_cost = 0;
3434   int scalar_single_iter_cost = 0;
3435   int scalar_outside_cost = 0;
3436   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3437   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3438   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3439
3440   /* Cost model disabled.  */
3441   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3442     {
3443       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3444       *ret_min_profitable_niters = 0;
3445       *ret_min_profitable_estimate = 0;
3446       return;
3447     }
3448
3449   /* Requires loop versioning tests to handle misalignment.  */
3450   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3451     {
3452       /*  FIXME: Make cost depend on complexity of individual check.  */
3453       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3454       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3455                             vect_prologue);
3456       dump_printf (MSG_NOTE,
3457                    "cost model: Adding cost of checks for loop "
3458                    "versioning to treat misalignment.\n");
3459     }
3460
3461   /* Requires loop versioning with alias checks.  */
3462   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3463     {
3464       /*  FIXME: Make cost depend on complexity of individual check.  */
3465       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3466       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3467                             vect_prologue);
3468       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3469       if (len)
3470         /* Count LEN - 1 ANDs and LEN comparisons.  */
3471         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3472                               NULL, 0, vect_prologue);
3473       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3474       if (len)
3475         {
3476           /* Count LEN - 1 ANDs and LEN comparisons.  */
3477           unsigned int nstmts = len * 2 - 1;
3478           /* +1 for each bias that needs adding.  */
3479           for (unsigned int i = 0; i < len; ++i)
3480             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3481               nstmts += 1;
3482           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3483                                 NULL, 0, vect_prologue);
3484         }
3485       dump_printf (MSG_NOTE,
3486                    "cost model: Adding cost of checks for loop "
3487                    "versioning aliasing.\n");
3488     }
3489
3490   /* Requires loop versioning with niter checks.  */
3491   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3492     {
3493       /*  FIXME: Make cost depend on complexity of individual check.  */
3494       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3495                             vect_prologue);
3496       dump_printf (MSG_NOTE,
3497                    "cost model: Adding cost of checks for loop "
3498                    "versioning niters.\n");
3499     }
3500
3501   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3502     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3503                           vect_prologue);
3504
3505   /* Count statements in scalar loop.  Using this as scalar cost for a single
3506      iteration for now.
3507
3508      TODO: Add outer loop support.
3509
3510      TODO: Consider assigning different costs to different scalar
3511      statements.  */
3512
3513   scalar_single_iter_cost
3514     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3515
3516   /* Add additional cost for the peeled instructions in prologue and epilogue
3517      loop.  (For fully-masked loops there will be no peeling.)
3518
3519      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3520      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3521
3522      TODO: Build an expression that represents peel_iters for prologue and
3523      epilogue to be used in a run-time test.  */
3524
3525   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3526     {
3527       peel_iters_prologue = 0;
3528       peel_iters_epilogue = 0;
3529
3530       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3531         {
3532           /* We need to peel exactly one iteration.  */
3533           peel_iters_epilogue += 1;
3534           stmt_info_for_cost *si;
3535           int j;
3536           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3537                             j, si)
3538             {
3539               struct _stmt_vec_info *stmt_info
3540                 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3541               (void) add_stmt_cost (target_cost_data, si->count,
3542                                     si->kind, stmt_info, si->misalign,
3543                                     vect_epilogue);
3544             }
3545         }
3546     }
3547   else if (npeel < 0)
3548     {
3549       peel_iters_prologue = assumed_vf / 2;
3550       dump_printf (MSG_NOTE, "cost model: "
3551                    "prologue peel iters set to vf/2.\n");
3552
3553       /* If peeling for alignment is unknown, loop bound of main loop becomes
3554          unknown.  */
3555       peel_iters_epilogue = assumed_vf / 2;
3556       dump_printf (MSG_NOTE, "cost model: "
3557                    "epilogue peel iters set to vf/2 because "
3558                    "peeling for alignment is unknown.\n");
3559
3560       /* If peeled iterations are unknown, count a taken branch and a not taken
3561          branch per peeled loop. Even if scalar loop iterations are known,
3562          vector iterations are not known since peeled prologue iterations are
3563          not known. Hence guards remain the same.  */
3564       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3565                             NULL, 0, vect_prologue);
3566       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3567                             NULL, 0, vect_prologue);
3568       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3569                             NULL, 0, vect_epilogue);
3570       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3571                             NULL, 0, vect_epilogue);
3572       stmt_info_for_cost *si;
3573       int j;
3574       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3575         {
3576           struct _stmt_vec_info *stmt_info
3577             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3578           (void) add_stmt_cost (target_cost_data,
3579                                 si->count * peel_iters_prologue,
3580                                 si->kind, stmt_info, si->misalign,
3581                                 vect_prologue);
3582           (void) add_stmt_cost (target_cost_data,
3583                                 si->count * peel_iters_epilogue,
3584                                 si->kind, stmt_info, si->misalign,
3585                                 vect_epilogue);
3586         }
3587     }
3588   else
3589     {
3590       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3591       stmt_info_for_cost *si;
3592       int j;
3593       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3594
3595       prologue_cost_vec.create (2);
3596       epilogue_cost_vec.create (2);
3597       peel_iters_prologue = npeel;
3598
3599       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3600                                           &peel_iters_epilogue,
3601                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3602                                             (loop_vinfo),
3603                                           &prologue_cost_vec,
3604                                           &epilogue_cost_vec);
3605
3606       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3607         {
3608           struct _stmt_vec_info *stmt_info
3609             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3610           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3611                                 si->misalign, vect_prologue);
3612         }
3613
3614       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3615         {
3616           struct _stmt_vec_info *stmt_info
3617             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3618           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3619                                 si->misalign, vect_epilogue);
3620         }
3621
3622       prologue_cost_vec.release ();
3623       epilogue_cost_vec.release ();
3624     }
3625
3626   /* FORNOW: The scalar outside cost is incremented in one of the
3627      following ways:
3628
3629      1. The vectorizer checks for alignment and aliasing and generates
3630      a condition that allows dynamic vectorization.  A cost model
3631      check is ANDED with the versioning condition.  Hence scalar code
3632      path now has the added cost of the versioning check.
3633
3634        if (cost > th & versioning_check)
3635          jmp to vector code
3636
3637      Hence run-time scalar is incremented by not-taken branch cost.
3638
3639      2. The vectorizer then checks if a prologue is required.  If the
3640      cost model check was not done before during versioning, it has to
3641      be done before the prologue check.
3642
3643        if (cost <= th)
3644          prologue = scalar_iters
3645        if (prologue == 0)
3646          jmp to vector code
3647        else
3648          execute prologue
3649        if (prologue == num_iters)
3650          go to exit
3651
3652      Hence the run-time scalar cost is incremented by a taken branch,
3653      plus a not-taken branch, plus a taken branch cost.
3654
3655      3. The vectorizer then checks if an epilogue is required.  If the
3656      cost model check was not done before during prologue check, it
3657      has to be done with the epilogue check.
3658
3659        if (prologue == 0)
3660          jmp to vector code
3661        else
3662          execute prologue
3663        if (prologue == num_iters)
3664          go to exit
3665        vector code:
3666          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3667            jmp to epilogue
3668
3669      Hence the run-time scalar cost should be incremented by 2 taken
3670      branches.
3671
3672      TODO: The back end may reorder the BBS's differently and reverse
3673      conditions/branch directions.  Change the estimates below to
3674      something more reasonable.  */
3675
3676   /* If the number of iterations is known and we do not do versioning, we can
3677      decide whether to vectorize at compile time.  Hence the scalar version
3678      do not carry cost model guard costs.  */
3679   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3680       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3681     {
3682       /* Cost model check occurs at versioning.  */
3683       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3684         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3685       else
3686         {
3687           /* Cost model check occurs at prologue generation.  */
3688           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3689             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3690               + vect_get_stmt_cost (cond_branch_not_taken);
3691           /* Cost model check occurs at epilogue generation.  */
3692           else
3693             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3694         }
3695     }
3696
3697   /* Complete the target-specific cost calculations.  */
3698   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3699                &vec_inside_cost, &vec_epilogue_cost);
3700
3701   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3702
3703   if (dump_enabled_p ())
3704     {
3705       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3706       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3707                    vec_inside_cost);
3708       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3709                    vec_prologue_cost);
3710       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3711                    vec_epilogue_cost);
3712       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3713                    scalar_single_iter_cost);
3714       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3715                    scalar_outside_cost);
3716       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3717                    vec_outside_cost);
3718       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3719                    peel_iters_prologue);
3720       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3721                    peel_iters_epilogue);
3722     }
3723
3724   /* Calculate number of iterations required to make the vector version
3725      profitable, relative to the loop bodies only.  The following condition
3726      must hold true:
3727      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3728      where
3729      SIC = scalar iteration cost, VIC = vector iteration cost,
3730      VOC = vector outside cost, VF = vectorization factor,
3731      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3732      SOC = scalar outside cost for run time cost model check.  */
3733
3734   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3735     {
3736       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3737                               * assumed_vf
3738                               - vec_inside_cost * peel_iters_prologue
3739                               - vec_inside_cost * peel_iters_epilogue);
3740       if (min_profitable_iters <= 0)
3741         min_profitable_iters = 0;
3742       else
3743         {
3744           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3745                                    - vec_inside_cost);
3746
3747           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3748               <= (((int) vec_inside_cost * min_profitable_iters)
3749                   + (((int) vec_outside_cost - scalar_outside_cost)
3750                      * assumed_vf)))
3751             min_profitable_iters++;
3752         }
3753     }
3754   /* vector version will never be profitable.  */
3755   else
3756     {
3757       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3758         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3759                     "vectorization did not happen for a simd loop");
3760
3761       if (dump_enabled_p ())
3762         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3763                          "cost model: the vector iteration cost = %d "
3764                          "divided by the scalar iteration cost = %d "
3765                          "is greater or equal to the vectorization factor = %d"
3766                          ".\n",
3767                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3768       *ret_min_profitable_niters = -1;
3769       *ret_min_profitable_estimate = -1;
3770       return;
3771     }
3772
3773   dump_printf (MSG_NOTE,
3774                "  Calculated minimum iters for profitability: %d\n",
3775                min_profitable_iters);
3776
3777   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3778       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3779     /* We want the vectorized loop to execute at least once.  */
3780     min_profitable_iters = assumed_vf + peel_iters_prologue;
3781
3782   if (dump_enabled_p ())
3783     dump_printf_loc (MSG_NOTE, vect_location,
3784                      "  Runtime profitability threshold = %d\n",
3785                      min_profitable_iters);
3786
3787   *ret_min_profitable_niters = min_profitable_iters;
3788
3789   /* Calculate number of iterations required to make the vector version
3790      profitable, relative to the loop bodies only.
3791
3792      Non-vectorized variant is SIC * niters and it must win over vector
3793      variant on the expected loop trip count.  The following condition must hold true:
3794      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3795
3796   if (vec_outside_cost <= 0)
3797     min_profitable_estimate = 0;
3798   else
3799     {
3800       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3801                                  * assumed_vf
3802                                  - vec_inside_cost * peel_iters_prologue
3803                                  - vec_inside_cost * peel_iters_epilogue)
3804                                  / ((scalar_single_iter_cost * assumed_vf)
3805                                    - vec_inside_cost);
3806     }
3807   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3808   if (dump_enabled_p ())
3809     dump_printf_loc (MSG_NOTE, vect_location,
3810                      "  Static estimate profitability threshold = %d\n",
3811                      min_profitable_estimate);
3812
3813   *ret_min_profitable_estimate = min_profitable_estimate;
3814 }
3815
3816 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3817    vector elements (not bits) for a vector with NELT elements.  */
3818 static void
3819 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3820                               vec_perm_builder *sel)
3821 {
3822   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3823      by vec_perm_indices.  */
3824   sel->new_vector (nelt, 1, 3);
3825   for (unsigned int i = 0; i < 3; i++)
3826     sel->quick_push (i + offset);
3827 }
3828
3829 /* Checks whether the target supports whole-vector shifts for vectors of mode
3830    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3831    it supports vec_perm_const with masks for all necessary shift amounts.  */
3832 static bool
3833 have_whole_vector_shift (machine_mode mode)
3834 {
3835   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3836     return true;
3837
3838   /* Variable-length vectors should be handled via the optab.  */
3839   unsigned int nelt;
3840   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3841     return false;
3842
3843   vec_perm_builder sel;
3844   vec_perm_indices indices;
3845   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3846     {
3847       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3848       indices.new_vector (sel, 2, nelt);
3849       if (!can_vec_perm_const_p (mode, indices, false))
3850         return false;
3851     }
3852   return true;
3853 }
3854
3855 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3856    functions. Design better to avoid maintenance issues.  */
3857
3858 /* Function vect_model_reduction_cost.
3859
3860    Models cost for a reduction operation, including the vector ops
3861    generated within the strip-mine loop, the initial definition before
3862    the loop, and the epilogue code that must be generated.  */
3863
3864 static void
3865 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3866                            int ncopies, stmt_vector_for_cost *cost_vec)
3867 {
3868   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3869   enum tree_code code;
3870   optab optab;
3871   tree vectype;
3872   gimple *orig_stmt;
3873   machine_mode mode;
3874   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3875   struct loop *loop = NULL;
3876
3877   if (loop_vinfo)
3878     loop = LOOP_VINFO_LOOP (loop_vinfo);
3879
3880   /* Condition reductions generate two reductions in the loop.  */
3881   vect_reduction_type reduction_type
3882     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3883   if (reduction_type == COND_REDUCTION)
3884     ncopies *= 2;
3885
3886   vectype = STMT_VINFO_VECTYPE (stmt_info);
3887   mode = TYPE_MODE (vectype);
3888   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3889
3890   if (!orig_stmt)
3891     orig_stmt = STMT_VINFO_STMT (stmt_info);
3892
3893   code = gimple_assign_rhs_code (orig_stmt);
3894
3895   if (reduction_type == EXTRACT_LAST_REDUCTION
3896       || reduction_type == FOLD_LEFT_REDUCTION)
3897     {
3898       /* No extra instructions needed in the prologue.  */
3899       prologue_cost = 0;
3900
3901       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3902         /* Count one reduction-like operation per vector.  */
3903         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3904                                         stmt_info, 0, vect_body);
3905       else
3906         {
3907           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
3908           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3909           inside_cost = record_stmt_cost (cost_vec, nelements,
3910                                           vec_to_scalar, stmt_info, 0,
3911                                           vect_body);
3912           inside_cost += record_stmt_cost (cost_vec, nelements,
3913                                            scalar_stmt, stmt_info, 0,
3914                                            vect_body);
3915         }
3916     }
3917   else
3918     {
3919       /* Add in cost for initial definition.
3920          For cond reduction we have four vectors: initial index, step,
3921          initial result of the data reduction, initial value of the index
3922          reduction.  */
3923       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3924       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3925                                          scalar_to_vec, stmt_info, 0,
3926                                          vect_prologue);
3927
3928       /* Cost of reduction op inside loop.  */
3929       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3930                                       stmt_info, 0, vect_body);
3931     }
3932
3933   /* Determine cost of epilogue code.
3934
3935      We have a reduction operator that will reduce the vector in one statement.
3936      Also requires scalar extract.  */
3937
3938   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3939     {
3940       if (reduc_fn != IFN_LAST)
3941         {
3942           if (reduction_type == COND_REDUCTION)
3943             {
3944               /* An EQ stmt and an COND_EXPR stmt.  */
3945               epilogue_cost += record_stmt_cost (cost_vec, 2,
3946                                                  vector_stmt, stmt_info, 0,
3947                                                  vect_epilogue);
3948               /* Reduction of the max index and a reduction of the found
3949                  values.  */
3950               epilogue_cost += record_stmt_cost (cost_vec, 2,
3951                                                  vec_to_scalar, stmt_info, 0,
3952                                                  vect_epilogue);
3953               /* A broadcast of the max value.  */
3954               epilogue_cost += record_stmt_cost (cost_vec, 1,
3955                                                  scalar_to_vec, stmt_info, 0,
3956                                                  vect_epilogue);
3957             }
3958           else
3959             {
3960               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3961                                                  stmt_info, 0, vect_epilogue);
3962               epilogue_cost += record_stmt_cost (cost_vec, 1,
3963                                                  vec_to_scalar, stmt_info, 0,
3964                                                  vect_epilogue);
3965             }
3966         }
3967       else if (reduction_type == COND_REDUCTION)
3968         {
3969           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3970           /* Extraction of scalar elements.  */
3971           epilogue_cost += record_stmt_cost (cost_vec,
3972                                              2 * estimated_nunits,
3973                                              vec_to_scalar, stmt_info, 0,
3974                                              vect_epilogue);
3975           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3976           epilogue_cost += record_stmt_cost (cost_vec,
3977                                              2 * estimated_nunits - 3,
3978                                              scalar_stmt, stmt_info, 0,
3979                                              vect_epilogue);
3980         }
3981       else if (reduction_type == EXTRACT_LAST_REDUCTION
3982                || reduction_type == FOLD_LEFT_REDUCTION)
3983         /* No extra instructions need in the epilogue.  */
3984         ;
3985       else
3986         {
3987           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3988           tree bitsize =
3989             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3990           int element_bitsize = tree_to_uhwi (bitsize);
3991           int nelements = vec_size_in_bits / element_bitsize;
3992
3993           if (code == COND_EXPR)
3994             code = MAX_EXPR;
3995
3996           optab = optab_for_tree_code (code, vectype, optab_default);
3997
3998           /* We have a whole vector shift available.  */
3999           if (optab != unknown_optab
4000               && VECTOR_MODE_P (mode)
4001               && optab_handler (optab, mode) != CODE_FOR_nothing
4002               && have_whole_vector_shift (mode))
4003             {
4004               /* Final reduction via vector shifts and the reduction operator.
4005                  Also requires scalar extract.  */
4006               epilogue_cost += record_stmt_cost (cost_vec,
4007                                                  exact_log2 (nelements) * 2,
4008                                                  vector_stmt, stmt_info, 0,
4009                                                  vect_epilogue);
4010               epilogue_cost += record_stmt_cost (cost_vec, 1,
4011                                                  vec_to_scalar, stmt_info, 0,
4012                                                  vect_epilogue);
4013             }
4014           else
4015             /* Use extracts and reduction op for final reduction.  For N
4016                elements, we have N extracts and N-1 reduction ops.  */
4017             epilogue_cost += record_stmt_cost (cost_vec,
4018                                                nelements + nelements - 1,
4019                                                vector_stmt, stmt_info, 0,
4020                                                vect_epilogue);
4021         }
4022     }
4023
4024   if (dump_enabled_p ())
4025     dump_printf (MSG_NOTE,
4026                  "vect_model_reduction_cost: inside_cost = %d, "
4027                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4028                  prologue_cost, epilogue_cost);
4029 }
4030
4031
4032 /* Function vect_model_induction_cost.
4033
4034    Models cost for induction operations.  */
4035
4036 static void
4037 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4038                            stmt_vector_for_cost *cost_vec)
4039 {
4040   unsigned inside_cost, prologue_cost;
4041
4042   if (PURE_SLP_STMT (stmt_info))
4043     return;
4044
4045   /* loop cost for vec_loop.  */
4046   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4047                                   stmt_info, 0, vect_body);
4048
4049   /* prologue cost for vec_init and vec_step.  */
4050   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4051                                     stmt_info, 0, vect_prologue);
4052
4053   if (dump_enabled_p ())
4054     dump_printf_loc (MSG_NOTE, vect_location,
4055                      "vect_model_induction_cost: inside_cost = %d, "
4056                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4057 }
4058
4059
4060
4061 /* Function get_initial_def_for_reduction
4062
4063    Input:
4064    STMT - a stmt that performs a reduction operation in the loop.
4065    INIT_VAL - the initial value of the reduction variable
4066
4067    Output:
4068    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4069         of the reduction (used for adjusting the epilog - see below).
4070    Return a vector variable, initialized according to the operation that STMT
4071         performs. This vector will be used as the initial value of the
4072         vector of partial results.
4073
4074    Option1 (adjust in epilog): Initialize the vector as follows:
4075      add/bit or/xor:    [0,0,...,0,0]
4076      mult/bit and:      [1,1,...,1,1]
4077      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4078    and when necessary (e.g. add/mult case) let the caller know
4079    that it needs to adjust the result by init_val.
4080
4081    Option2: Initialize the vector as follows:
4082      add/bit or/xor:    [init_val,0,0,...,0]
4083      mult/bit and:      [init_val,1,1,...,1]
4084      min/max/cond_expr: [init_val,init_val,...,init_val]
4085    and no adjustments are needed.
4086
4087    For example, for the following code:
4088
4089    s = init_val;
4090    for (i=0;i<n;i++)
4091      s = s + a[i];
4092
4093    STMT is 's = s + a[i]', and the reduction variable is 's'.
4094    For a vector of 4 units, we want to return either [0,0,0,init_val],
4095    or [0,0,0,0] and let the caller know that it needs to adjust
4096    the result at the end by 'init_val'.
4097
4098    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4099    initialization vector is simpler (same element in all entries), if
4100    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4101
4102    A cost model should help decide between these two schemes.  */
4103
4104 tree
4105 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4106                                tree *adjustment_def)
4107 {
4108   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4109   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4110   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4111   tree scalar_type = TREE_TYPE (init_val);
4112   tree vectype = get_vectype_for_scalar_type (scalar_type);
4113   enum tree_code code = gimple_assign_rhs_code (stmt);
4114   tree def_for_init;
4115   tree init_def;
4116   REAL_VALUE_TYPE real_init_val = dconst0;
4117   int int_init_val = 0;
4118   gimple_seq stmts = NULL;
4119
4120   gcc_assert (vectype);
4121
4122   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4123               || SCALAR_FLOAT_TYPE_P (scalar_type));
4124
4125   gcc_assert (nested_in_vect_loop_p (loop, stmt)
4126               || loop == (gimple_bb (stmt))->loop_father);
4127
4128   vect_reduction_type reduction_type
4129     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4130
4131   switch (code)
4132     {
4133     case WIDEN_SUM_EXPR:
4134     case DOT_PROD_EXPR:
4135     case SAD_EXPR:
4136     case PLUS_EXPR:
4137     case MINUS_EXPR:
4138     case BIT_IOR_EXPR:
4139     case BIT_XOR_EXPR:
4140     case MULT_EXPR:
4141     case BIT_AND_EXPR:
4142       {
4143         /* ADJUSTMENT_DEF is NULL when called from
4144            vect_create_epilog_for_reduction to vectorize double reduction.  */
4145         if (adjustment_def)
4146           *adjustment_def = init_val;
4147
4148         if (code == MULT_EXPR)
4149           {
4150             real_init_val = dconst1;
4151             int_init_val = 1;
4152           }
4153
4154         if (code == BIT_AND_EXPR)
4155           int_init_val = -1;
4156
4157         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4158           def_for_init = build_real (scalar_type, real_init_val);
4159         else
4160           def_for_init = build_int_cst (scalar_type, int_init_val);
4161
4162         if (adjustment_def)
4163           /* Option1: the first element is '0' or '1' as well.  */
4164           init_def = gimple_build_vector_from_val (&stmts, vectype,
4165                                                    def_for_init);
4166         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4167           {
4168             /* Option2 (variable length): the first element is INIT_VAL.  */
4169             init_def = gimple_build_vector_from_val (&stmts, vectype,
4170                                                      def_for_init);
4171             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4172                                      vectype, init_def, init_val);
4173           }
4174         else
4175           {
4176             /* Option2: the first element is INIT_VAL.  */
4177             tree_vector_builder elts (vectype, 1, 2);
4178             elts.quick_push (init_val);
4179             elts.quick_push (def_for_init);
4180             init_def = gimple_build_vector (&stmts, &elts);
4181           }
4182       }
4183       break;
4184
4185     case MIN_EXPR:
4186     case MAX_EXPR:
4187     case COND_EXPR:
4188       {
4189         if (adjustment_def)
4190           {
4191             *adjustment_def = NULL_TREE;
4192             if (reduction_type != COND_REDUCTION
4193                 && reduction_type != EXTRACT_LAST_REDUCTION)
4194               {
4195                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4196                 break;
4197               }
4198           }
4199         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4200         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4201       }
4202       break;
4203
4204     default:
4205       gcc_unreachable ();
4206     }
4207
4208   if (stmts)
4209     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4210   return init_def;
4211 }
4212
4213 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4214    NUMBER_OF_VECTORS is the number of vector defs to create.
4215    If NEUTRAL_OP is nonnull, introducing extra elements of that
4216    value will not change the result.  */
4217
4218 static void
4219 get_initial_defs_for_reduction (slp_tree slp_node,
4220                                 vec<tree> *vec_oprnds,
4221                                 unsigned int number_of_vectors,
4222                                 bool reduc_chain, tree neutral_op)
4223 {
4224   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4225   gimple *stmt = stmts[0];
4226   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4227   unsigned HOST_WIDE_INT nunits;
4228   unsigned j, number_of_places_left_in_vector;
4229   tree vector_type;
4230   tree vop;
4231   int group_size = stmts.length ();
4232   unsigned int vec_num, i;
4233   unsigned number_of_copies = 1;
4234   vec<tree> voprnds;
4235   voprnds.create (number_of_vectors);
4236   struct loop *loop;
4237   auto_vec<tree, 16> permute_results;
4238
4239   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4240
4241   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4242
4243   loop = (gimple_bb (stmt))->loop_father;
4244   gcc_assert (loop);
4245   edge pe = loop_preheader_edge (loop);
4246
4247   gcc_assert (!reduc_chain || neutral_op);
4248
4249   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4250      created vectors. It is greater than 1 if unrolling is performed.
4251
4252      For example, we have two scalar operands, s1 and s2 (e.g., group of
4253      strided accesses of size two), while NUNITS is four (i.e., four scalars
4254      of this type can be packed in a vector).  The output vector will contain
4255      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4256      will be 2).
4257
4258      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4259      vectors containing the operands.
4260
4261      For example, NUNITS is four as before, and the group size is 8
4262      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4263      {s5, s6, s7, s8}.  */
4264
4265   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4266     nunits = group_size;
4267
4268   number_of_copies = nunits * number_of_vectors / group_size;
4269
4270   number_of_places_left_in_vector = nunits;
4271   bool constant_p = true;
4272   tree_vector_builder elts (vector_type, nunits, 1);
4273   elts.quick_grow (nunits);
4274   for (j = 0; j < number_of_copies; j++)
4275     {
4276       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4277         {
4278           tree op;
4279           /* Get the def before the loop.  In reduction chain we have only
4280              one initial value.  */
4281           if ((j != (number_of_copies - 1)
4282                || (reduc_chain && i != 0))
4283               && neutral_op)
4284             op = neutral_op;
4285           else
4286             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4287
4288           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4289           number_of_places_left_in_vector--;
4290           elts[number_of_places_left_in_vector] = op;
4291           if (!CONSTANT_CLASS_P (op))
4292             constant_p = false;
4293
4294           if (number_of_places_left_in_vector == 0)
4295             {
4296               gimple_seq ctor_seq = NULL;
4297               tree init;
4298               if (constant_p && !neutral_op
4299                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4300                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4301                 /* Build the vector directly from ELTS.  */
4302                 init = gimple_build_vector (&ctor_seq, &elts);
4303               else if (neutral_op)
4304                 {
4305                   /* Build a vector of the neutral value and shift the
4306                      other elements into place.  */
4307                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4308                                                        neutral_op);
4309                   int k = nunits;
4310                   while (k > 0 && elts[k - 1] == neutral_op)
4311                     k -= 1;
4312                   while (k > 0)
4313                     {
4314                       k -= 1;
4315                       init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4316                                            vector_type, init, elts[k]);
4317                     }
4318                 }
4319               else
4320                 {
4321                   /* First time round, duplicate ELTS to fill the
4322                      required number of vectors, then cherry pick the
4323                      appropriate result for each iteration.  */
4324                   if (vec_oprnds->is_empty ())
4325                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4326                                               number_of_vectors,
4327                                               permute_results);
4328                   init = permute_results[number_of_vectors - j - 1];
4329                 }
4330               if (ctor_seq != NULL)
4331                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4332               voprnds.quick_push (init);
4333
4334               number_of_places_left_in_vector = nunits;
4335               elts.new_vector (vector_type, nunits, 1);
4336               elts.quick_grow (nunits);
4337               constant_p = true;
4338             }
4339         }
4340     }
4341
4342   /* Since the vectors are created in the reverse order, we should invert
4343      them.  */
4344   vec_num = voprnds.length ();
4345   for (j = vec_num; j != 0; j--)
4346     {
4347       vop = voprnds[j - 1];
4348       vec_oprnds->quick_push (vop);
4349     }
4350
4351   voprnds.release ();
4352
4353   /* In case that VF is greater than the unrolling factor needed for the SLP
4354      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4355      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4356      to replicate the vectors.  */
4357   tree neutral_vec = NULL;
4358   while (number_of_vectors > vec_oprnds->length ())
4359     {
4360       if (neutral_op)
4361         {
4362           if (!neutral_vec)
4363             {
4364               gimple_seq ctor_seq = NULL;
4365               neutral_vec = gimple_build_vector_from_val
4366                 (&ctor_seq, vector_type, neutral_op);
4367               if (ctor_seq != NULL)
4368                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4369             }
4370           vec_oprnds->quick_push (neutral_vec);
4371         }
4372       else
4373         {
4374           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4375             vec_oprnds->quick_push (vop);
4376         }
4377     }
4378 }
4379
4380
4381 /* Function vect_create_epilog_for_reduction
4382
4383    Create code at the loop-epilog to finalize the result of a reduction
4384    computation.
4385
4386    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4387      reduction statements.
4388    STMT is the scalar reduction stmt that is being vectorized.
4389    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4390      number of elements that we can fit in a vectype (nunits).  In this case
4391      we have to generate more than one vector stmt - i.e - we need to "unroll"
4392      the vector stmt by a factor VF/nunits.  For more details see documentation
4393      in vectorizable_operation.
4394    REDUC_FN is the internal function for the epilog reduction.
4395    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4396      computation.
4397    REDUC_INDEX is the index of the operand in the right hand side of the
4398      statement that is defined by REDUCTION_PHI.
4399    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4400    SLP_NODE is an SLP node containing a group of reduction statements. The
4401      first one in this group is STMT.
4402    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4403      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4404      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4405      any value of the IV in the loop.
4406    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4407    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4408      null if this is not an SLP reduction
4409
4410    This function:
4411    1. Creates the reduction def-use cycles: sets the arguments for
4412       REDUCTION_PHIS:
4413       The loop-entry argument is the vectorized initial-value of the reduction.
4414       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4415       sums.
4416    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4417       by calling the function specified by REDUC_FN if available, or by
4418       other means (whole-vector shifts or a scalar loop).
4419       The function also creates a new phi node at the loop exit to preserve
4420       loop-closed form, as illustrated below.
4421
4422      The flow at the entry to this function:
4423
4424         loop:
4425           vec_def = phi <null, null>            # REDUCTION_PHI
4426           VECT_DEF = vector_stmt                # vectorized form of STMT
4427           s_loop = scalar_stmt                  # (scalar) STMT
4428         loop_exit:
4429           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4430           use <s_out0>
4431           use <s_out0>
4432
4433      The above is transformed by this function into:
4434
4435         loop:
4436           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4437           VECT_DEF = vector_stmt                # vectorized form of STMT
4438           s_loop = scalar_stmt                  # (scalar) STMT
4439         loop_exit:
4440           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4441           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4442           v_out2 = reduce <v_out1>
4443           s_out3 = extract_field <v_out2, 0>
4444           s_out4 = adjust_result <s_out3>
4445           use <s_out4>
4446           use <s_out4>
4447 */
4448
4449 static void
4450 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4451                                   gimple *reduc_def_stmt,
4452                                   int ncopies, internal_fn reduc_fn,
4453                                   vec<gimple *> reduction_phis,
4454                                   bool double_reduc,
4455                                   slp_tree slp_node,
4456                                   slp_instance slp_node_instance,
4457                                   tree induc_val, enum tree_code induc_code,
4458                                   tree neutral_op)
4459 {
4460   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4461   stmt_vec_info prev_phi_info;
4462   tree vectype;
4463   machine_mode mode;
4464   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4465   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4466   basic_block exit_bb;
4467   tree scalar_dest;
4468   tree scalar_type;
4469   gimple *new_phi = NULL, *phi;
4470   gimple_stmt_iterator exit_gsi;
4471   tree vec_dest;
4472   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4473   gimple *epilog_stmt = NULL;
4474   enum tree_code code = gimple_assign_rhs_code (stmt);
4475   gimple *exit_phi;
4476   tree bitsize;
4477   tree adjustment_def = NULL;
4478   tree vec_initial_def = NULL;
4479   tree expr, def, initial_def = NULL;
4480   tree orig_name, scalar_result;
4481   imm_use_iterator imm_iter, phi_imm_iter;
4482   use_operand_p use_p, phi_use_p;
4483   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4484   bool nested_in_vect_loop = false;
4485   auto_vec<gimple *> new_phis;
4486   auto_vec<gimple *> inner_phis;
4487   enum vect_def_type dt = vect_unknown_def_type;
4488   int j, i;
4489   auto_vec<tree> scalar_results;
4490   unsigned int group_size = 1, k, ratio;
4491   auto_vec<tree> vec_initial_defs;
4492   auto_vec<gimple *> phis;
4493   bool slp_reduc = false;
4494   bool direct_slp_reduc;
4495   tree new_phi_result;
4496   gimple *inner_phi = NULL;
4497   tree induction_index = NULL_TREE;
4498
4499   if (slp_node)
4500     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4501
4502   if (nested_in_vect_loop_p (loop, stmt))
4503     {
4504       outer_loop = loop;
4505       loop = loop->inner;
4506       nested_in_vect_loop = true;
4507       gcc_assert (!slp_node);
4508     }
4509
4510   vectype = STMT_VINFO_VECTYPE (stmt_info);
4511   gcc_assert (vectype);
4512   mode = TYPE_MODE (vectype);
4513
4514   /* 1. Create the reduction def-use cycle:
4515      Set the arguments of REDUCTION_PHIS, i.e., transform
4516
4517         loop:
4518           vec_def = phi <null, null>            # REDUCTION_PHI
4519           VECT_DEF = vector_stmt                # vectorized form of STMT
4520           ...
4521
4522      into:
4523
4524         loop:
4525           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4526           VECT_DEF = vector_stmt                # vectorized form of STMT
4527           ...
4528
4529      (in case of SLP, do it for all the phis). */
4530
4531   /* Get the loop-entry arguments.  */
4532   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4533   if (slp_node)
4534     {
4535       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4536       vec_initial_defs.reserve (vec_num);
4537       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4538                                       &vec_initial_defs, vec_num,
4539                                       REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4540                                       neutral_op);
4541     }
4542   else
4543     {
4544       /* Get at the scalar def before the loop, that defines the initial value
4545          of the reduction variable.  */
4546       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4547                                            loop_preheader_edge (loop));
4548       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4549          and we can't use zero for induc_val, use initial_def.  Similarly
4550          for REDUC_MIN and initial_def larger than the base.  */
4551       if (TREE_CODE (initial_def) == INTEGER_CST
4552           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4553               == INTEGER_INDUC_COND_REDUCTION)
4554           && !integer_zerop (induc_val)
4555           && ((induc_code == MAX_EXPR
4556                && tree_int_cst_lt (initial_def, induc_val))
4557               || (induc_code == MIN_EXPR
4558                   && tree_int_cst_lt (induc_val, initial_def))))
4559         induc_val = initial_def;
4560
4561       if (double_reduc)
4562         /* In case of double reduction we only create a vector variable
4563            to be put in the reduction phi node.  The actual statement
4564            creation is done later in this function.  */
4565         vec_initial_def = vect_create_destination_var (initial_def, vectype);
4566       else if (nested_in_vect_loop)
4567         {
4568           /* Do not use an adjustment def as that case is not supported
4569              correctly if ncopies is not one.  */
4570           vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4571           vec_initial_def = vect_get_vec_def_for_operand (initial_def, stmt);
4572         }
4573       else
4574         vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4575                                                          &adjustment_def);
4576       vec_initial_defs.create (1);
4577       vec_initial_defs.quick_push (vec_initial_def);
4578     }
4579
4580   /* Set phi nodes arguments.  */
4581   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4582     {
4583       tree vec_init_def = vec_initial_defs[i];
4584       tree def = vect_defs[i];
4585       for (j = 0; j < ncopies; j++)
4586         {
4587           if (j != 0)
4588             {
4589               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4590               if (nested_in_vect_loop)
4591                 vec_init_def
4592                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4593                                                     vec_init_def);
4594             }
4595
4596           /* Set the loop-entry arg of the reduction-phi.  */
4597
4598           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4599               == INTEGER_INDUC_COND_REDUCTION)
4600             {
4601               /* Initialise the reduction phi to zero.  This prevents initial
4602                  values of non-zero interferring with the reduction op.  */
4603               gcc_assert (ncopies == 1);
4604               gcc_assert (i == 0);
4605
4606               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4607               tree induc_val_vec
4608                 = build_vector_from_val (vec_init_def_type, induc_val);
4609
4610               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4611                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4612             }
4613           else
4614             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4615                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4616
4617           /* Set the loop-latch arg for the reduction-phi.  */
4618           if (j > 0)
4619             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4620
4621           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4622                        UNKNOWN_LOCATION);
4623
4624           if (dump_enabled_p ())
4625             {
4626               dump_printf_loc (MSG_NOTE, vect_location,
4627                                "transform reduction: created def-use cycle: ");
4628               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4629               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4630             }
4631         }
4632     }
4633
4634   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4635      which is updated with the current index of the loop for every match of
4636      the original loop's cond_expr (VEC_STMT).  This results in a vector
4637      containing the last time the condition passed for that vector lane.
4638      The first match will be a 1 to allow 0 to be used for non-matching
4639      indexes.  If there are no matches at all then the vector will be all
4640      zeroes.  */
4641   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4642     {
4643       tree indx_before_incr, indx_after_incr;
4644       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4645
4646       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4647       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4648
4649       int scalar_precision
4650         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4651       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4652       tree cr_index_vector_type = build_vector_type
4653         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4654
4655       /* First we create a simple vector induction variable which starts
4656          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4657          vector size (STEP).  */
4658
4659       /* Create a {1,2,3,...} vector.  */
4660       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4661
4662       /* Create a vector of the step value.  */
4663       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4664       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4665
4666       /* Create an induction variable.  */
4667       gimple_stmt_iterator incr_gsi;
4668       bool insert_after;
4669       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4670       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4671                  insert_after, &indx_before_incr, &indx_after_incr);
4672
4673       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4674          filled with zeros (VEC_ZERO).  */
4675
4676       /* Create a vector of 0s.  */
4677       tree zero = build_zero_cst (cr_index_scalar_type);
4678       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4679
4680       /* Create a vector phi node.  */
4681       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4682       new_phi = create_phi_node (new_phi_tree, loop->header);
4683       set_vinfo_for_stmt (new_phi,
4684                           new_stmt_vec_info (new_phi, loop_vinfo));
4685       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4686                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4687
4688       /* Now take the condition from the loops original cond_expr
4689          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4690          every match uses values from the induction variable
4691          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4692          (NEW_PHI_TREE).
4693          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4694          the new cond_expr (INDEX_COND_EXPR).  */
4695
4696       /* Duplicate the condition from vec_stmt.  */
4697       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4698
4699       /* Create a conditional, where the condition is taken from vec_stmt
4700          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4701          else is the phi (NEW_PHI_TREE).  */
4702       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4703                                      ccompare, indx_before_incr,
4704                                      new_phi_tree);
4705       induction_index = make_ssa_name (cr_index_vector_type);
4706       gimple *index_condition = gimple_build_assign (induction_index,
4707                                                      index_cond_expr);
4708       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4709       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4710                                                         loop_vinfo);
4711       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4712       set_vinfo_for_stmt (index_condition, index_vec_info);
4713
4714       /* Update the phi with the vec cond.  */
4715       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4716                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4717     }
4718
4719   /* 2. Create epilog code.
4720         The reduction epilog code operates across the elements of the vector
4721         of partial results computed by the vectorized loop.
4722         The reduction epilog code consists of:
4723
4724         step 1: compute the scalar result in a vector (v_out2)
4725         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4726         step 3: adjust the scalar result (s_out3) if needed.
4727
4728         Step 1 can be accomplished using one the following three schemes:
4729           (scheme 1) using reduc_fn, if available.
4730           (scheme 2) using whole-vector shifts, if available.
4731           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4732                      combined.
4733
4734           The overall epilog code looks like this:
4735
4736           s_out0 = phi <s_loop>         # original EXIT_PHI
4737           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4738           v_out2 = reduce <v_out1>              # step 1
4739           s_out3 = extract_field <v_out2, 0>    # step 2
4740           s_out4 = adjust_result <s_out3>       # step 3
4741
4742           (step 3 is optional, and steps 1 and 2 may be combined).
4743           Lastly, the uses of s_out0 are replaced by s_out4.  */
4744
4745
4746   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4747          v_out1 = phi <VECT_DEF>
4748          Store them in NEW_PHIS.  */
4749
4750   exit_bb = single_exit (loop)->dest;
4751   prev_phi_info = NULL;
4752   new_phis.create (vect_defs.length ());
4753   FOR_EACH_VEC_ELT (vect_defs, i, def)
4754     {
4755       for (j = 0; j < ncopies; j++)
4756         {
4757           tree new_def = copy_ssa_name (def);
4758           phi = create_phi_node (new_def, exit_bb);
4759           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4760           if (j == 0)
4761             new_phis.quick_push (phi);
4762           else
4763             {
4764               def = vect_get_vec_def_for_stmt_copy (dt, def);
4765               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4766             }
4767
4768           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4769           prev_phi_info = vinfo_for_stmt (phi);
4770         }
4771     }
4772
4773   /* The epilogue is created for the outer-loop, i.e., for the loop being
4774      vectorized.  Create exit phis for the outer loop.  */
4775   if (double_reduc)
4776     {
4777       loop = outer_loop;
4778       exit_bb = single_exit (loop)->dest;
4779       inner_phis.create (vect_defs.length ());
4780       FOR_EACH_VEC_ELT (new_phis, i, phi)
4781         {
4782           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4783           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4784           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4785                            PHI_RESULT (phi));
4786           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4787                                                             loop_vinfo));
4788           inner_phis.quick_push (phi);
4789           new_phis[i] = outer_phi;
4790           prev_phi_info = vinfo_for_stmt (outer_phi);
4791           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4792             {
4793               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4794               new_result = copy_ssa_name (PHI_RESULT (phi));
4795               outer_phi = create_phi_node (new_result, exit_bb);
4796               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4797                                PHI_RESULT (phi));
4798               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4799                                                                 loop_vinfo));
4800               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4801               prev_phi_info = vinfo_for_stmt (outer_phi);
4802             }
4803         }
4804     }
4805
4806   exit_gsi = gsi_after_labels (exit_bb);
4807
4808   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4809          (i.e. when reduc_fn is not available) and in the final adjustment
4810          code (if needed).  Also get the original scalar reduction variable as
4811          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4812          represents a reduction pattern), the tree-code and scalar-def are
4813          taken from the original stmt that the pattern-stmt (STMT) replaces.
4814          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4815          are taken from STMT.  */
4816
4817   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4818   if (!orig_stmt)
4819     {
4820       /* Regular reduction  */
4821       orig_stmt = stmt;
4822     }
4823   else
4824     {
4825       /* Reduction pattern  */
4826       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4827       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4828       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4829     }
4830
4831   code = gimple_assign_rhs_code (orig_stmt);
4832   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4833      partial results are added and not subtracted.  */
4834   if (code == MINUS_EXPR)
4835     code = PLUS_EXPR;
4836
4837   scalar_dest = gimple_assign_lhs (orig_stmt);
4838   scalar_type = TREE_TYPE (scalar_dest);
4839   scalar_results.create (group_size);
4840   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4841   bitsize = TYPE_SIZE (scalar_type);
4842
4843   /* In case this is a reduction in an inner-loop while vectorizing an outer
4844      loop - we don't need to extract a single scalar result at the end of the
4845      inner-loop (unless it is double reduction, i.e., the use of reduction is
4846      outside the outer-loop).  The final vector of partial results will be used
4847      in the vectorized outer-loop, or reduced to a scalar result at the end of
4848      the outer-loop.  */
4849   if (nested_in_vect_loop && !double_reduc)
4850     goto vect_finalize_reduction;
4851
4852   /* SLP reduction without reduction chain, e.g.,
4853      # a1 = phi <a2, a0>
4854      # b1 = phi <b2, b0>
4855      a2 = operation (a1)
4856      b2 = operation (b1)  */
4857   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4858
4859   /* True if we should implement SLP_REDUC using native reduction operations
4860      instead of scalar operations.  */
4861   direct_slp_reduc = (reduc_fn != IFN_LAST
4862                       && slp_reduc
4863                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4864
4865   /* In case of reduction chain, e.g.,
4866      # a1 = phi <a3, a0>
4867      a2 = operation (a1)
4868      a3 = operation (a2),
4869
4870      we may end up with more than one vector result.  Here we reduce them to
4871      one vector.  */
4872   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4873     {
4874       tree first_vect = PHI_RESULT (new_phis[0]);
4875       gassign *new_vec_stmt = NULL;
4876       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4877       for (k = 1; k < new_phis.length (); k++)
4878         {
4879           gimple *next_phi = new_phis[k];
4880           tree second_vect = PHI_RESULT (next_phi);
4881           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4882           new_vec_stmt = gimple_build_assign (tem, code,
4883                                               first_vect, second_vect);
4884           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4885           first_vect = tem;
4886         }
4887
4888       new_phi_result = first_vect;
4889       if (new_vec_stmt)
4890         {
4891           new_phis.truncate (0);
4892           new_phis.safe_push (new_vec_stmt);
4893         }
4894     }
4895   /* Likewise if we couldn't use a single defuse cycle.  */
4896   else if (ncopies > 1)
4897     {
4898       gcc_assert (new_phis.length () == 1);
4899       tree first_vect = PHI_RESULT (new_phis[0]);
4900       gassign *new_vec_stmt = NULL;
4901       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4902       gimple *next_phi = new_phis[0];
4903       for (int k = 1; k < ncopies; ++k)
4904         {
4905           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4906           tree second_vect = PHI_RESULT (next_phi);
4907           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4908           new_vec_stmt = gimple_build_assign (tem, code,
4909                                               first_vect, second_vect);
4910           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4911           first_vect = tem;
4912         }
4913       new_phi_result = first_vect;
4914       new_phis.truncate (0);
4915       new_phis.safe_push (new_vec_stmt);
4916     }
4917   else
4918     new_phi_result = PHI_RESULT (new_phis[0]);
4919
4920   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4921       && reduc_fn != IFN_LAST)
4922     {
4923       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4924          various data values where the condition matched and another vector
4925          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4926          need to extract the last matching index (which will be the index with
4927          highest value) and use this to index into the data vector.
4928          For the case where there were no matches, the data vector will contain
4929          all default values and the index vector will be all zeros.  */
4930
4931       /* Get various versions of the type of the vector of indexes.  */
4932       tree index_vec_type = TREE_TYPE (induction_index);
4933       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4934       tree index_scalar_type = TREE_TYPE (index_vec_type);
4935       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4936         (index_vec_type);
4937
4938       /* Get an unsigned integer version of the type of the data vector.  */
4939       int scalar_precision
4940         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4941       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4942       tree vectype_unsigned = build_vector_type
4943         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4944
4945       /* First we need to create a vector (ZERO_VEC) of zeros and another
4946          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4947          can create using a MAX reduction and then expanding.
4948          In the case where the loop never made any matches, the max index will
4949          be zero.  */
4950
4951       /* Vector of {0, 0, 0,...}.  */
4952       tree zero_vec = make_ssa_name (vectype);
4953       tree zero_vec_rhs = build_zero_cst (vectype);
4954       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4955       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4956
4957       /* Find maximum value from the vector of found indexes.  */
4958       tree max_index = make_ssa_name (index_scalar_type);
4959       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4960                                                           1, induction_index);
4961       gimple_call_set_lhs (max_index_stmt, max_index);
4962       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4963
4964       /* Vector of {max_index, max_index, max_index,...}.  */
4965       tree max_index_vec = make_ssa_name (index_vec_type);
4966       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4967                                                       max_index);
4968       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4969                                                         max_index_vec_rhs);
4970       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4971
4972       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4973          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4974          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4975          otherwise.  Only one value should match, resulting in a vector
4976          (VEC_COND) with one data value and the rest zeros.
4977          In the case where the loop never made any matches, every index will
4978          match, resulting in a vector with all data values (which will all be
4979          the default value).  */
4980
4981       /* Compare the max index vector to the vector of found indexes to find
4982          the position of the max value.  */
4983       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4984       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4985                                                       induction_index,
4986                                                       max_index_vec);
4987       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4988
4989       /* Use the compare to choose either values from the data vector or
4990          zero.  */
4991       tree vec_cond = make_ssa_name (vectype);
4992       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4993                                                    vec_compare, new_phi_result,
4994                                                    zero_vec);
4995       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4996
4997       /* Finally we need to extract the data value from the vector (VEC_COND)
4998          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4999          reduction, but because this doesn't exist, we can use a MAX reduction
5000          instead.  The data value might be signed or a float so we need to cast
5001          it first.
5002          In the case where the loop never made any matches, the data values are
5003          all identical, and so will reduce down correctly.  */
5004
5005       /* Make the matched data values unsigned.  */
5006       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5007       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5008                                        vec_cond);
5009       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5010                                                         VIEW_CONVERT_EXPR,
5011                                                         vec_cond_cast_rhs);
5012       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5013
5014       /* Reduce down to a scalar value.  */
5015       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5016       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5017                                                            1, vec_cond_cast);
5018       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5019       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5020
5021       /* Convert the reduced value back to the result type and set as the
5022          result.  */
5023       gimple_seq stmts = NULL;
5024       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5025                                data_reduc);
5026       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5027       scalar_results.safe_push (new_temp);
5028     }
5029   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5030            && reduc_fn == IFN_LAST)
5031     {
5032       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5033          idx = 0;
5034          idx_val = induction_index[0];
5035          val = data_reduc[0];
5036          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5037            if (induction_index[i] > idx_val)
5038              val = data_reduc[i], idx_val = induction_index[i];
5039          return val;  */
5040
5041       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5042       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5043       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5044       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5045       /* Enforced by vectorizable_reduction, which ensures we have target
5046          support before allowing a conditional reduction on variable-length
5047          vectors.  */
5048       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5049       tree idx_val = NULL_TREE, val = NULL_TREE;
5050       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5051         {
5052           tree old_idx_val = idx_val;
5053           tree old_val = val;
5054           idx_val = make_ssa_name (idx_eltype);
5055           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5056                                              build3 (BIT_FIELD_REF, idx_eltype,
5057                                                      induction_index,
5058                                                      bitsize_int (el_size),
5059                                                      bitsize_int (off)));
5060           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5061           val = make_ssa_name (data_eltype);
5062           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5063                                              build3 (BIT_FIELD_REF,
5064                                                      data_eltype,
5065                                                      new_phi_result,
5066                                                      bitsize_int (el_size),
5067                                                      bitsize_int (off)));
5068           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5069           if (off != 0)
5070             {
5071               tree new_idx_val = idx_val;
5072               tree new_val = val;
5073               if (off != v_size - el_size)
5074                 {
5075                   new_idx_val = make_ssa_name (idx_eltype);
5076                   epilog_stmt = gimple_build_assign (new_idx_val,
5077                                                      MAX_EXPR, idx_val,
5078                                                      old_idx_val);
5079                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5080                 }
5081               new_val = make_ssa_name (data_eltype);
5082               epilog_stmt = gimple_build_assign (new_val,
5083                                                  COND_EXPR,
5084                                                  build2 (GT_EXPR,
5085                                                          boolean_type_node,
5086                                                          idx_val,
5087                                                          old_idx_val),
5088                                                  val, old_val);
5089               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5090               idx_val = new_idx_val;
5091               val = new_val;
5092             }
5093         }
5094       /* Convert the reduced value back to the result type and set as the
5095          result.  */
5096       gimple_seq stmts = NULL;
5097       val = gimple_convert (&stmts, scalar_type, val);
5098       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5099       scalar_results.safe_push (val);
5100     }
5101
5102   /* 2.3 Create the reduction code, using one of the three schemes described
5103          above. In SLP we simply need to extract all the elements from the
5104          vector (without reducing them), so we use scalar shifts.  */
5105   else if (reduc_fn != IFN_LAST && !slp_reduc)
5106     {
5107       tree tmp;
5108       tree vec_elem_type;
5109
5110       /* Case 1:  Create:
5111          v_out2 = reduc_expr <v_out1>  */
5112
5113       if (dump_enabled_p ())
5114         dump_printf_loc (MSG_NOTE, vect_location,
5115                          "Reduce using direct vector reduction.\n");
5116
5117       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5118       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5119         {
5120           tree tmp_dest
5121             = vect_create_destination_var (scalar_dest, vec_elem_type);
5122           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5123                                                     new_phi_result);
5124           gimple_set_lhs (epilog_stmt, tmp_dest);
5125           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5126           gimple_set_lhs (epilog_stmt, new_temp);
5127           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5128
5129           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5130                                              new_temp);
5131         }
5132       else
5133         {
5134           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5135                                                     new_phi_result);
5136           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5137         }
5138
5139       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5140       gimple_set_lhs (epilog_stmt, new_temp);
5141       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5142
5143       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5144            == INTEGER_INDUC_COND_REDUCTION)
5145           && !operand_equal_p (initial_def, induc_val, 0))
5146         {
5147           /* Earlier we set the initial value to be a vector if induc_val
5148              values.  Check the result and if it is induc_val then replace
5149              with the original initial value, unless induc_val is
5150              the same as initial_def already.  */
5151           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5152                                   induc_val);
5153
5154           tmp = make_ssa_name (new_scalar_dest);
5155           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5156                                              initial_def, new_temp);
5157           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5158           new_temp = tmp;
5159         }
5160
5161       scalar_results.safe_push (new_temp);
5162     }
5163   else if (direct_slp_reduc)
5164     {
5165       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5166          with the elements for other SLP statements replaced with the
5167          neutral value.  We can then do a normal reduction on each vector.  */
5168
5169       /* Enforced by vectorizable_reduction.  */
5170       gcc_assert (new_phis.length () == 1);
5171       gcc_assert (pow2p_hwi (group_size));
5172
5173       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5174       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5175       gimple_seq seq = NULL;
5176
5177       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5178          and the same element size as VECTYPE.  */
5179       tree index = build_index_vector (vectype, 0, 1);
5180       tree index_type = TREE_TYPE (index);
5181       tree index_elt_type = TREE_TYPE (index_type);
5182       tree mask_type = build_same_sized_truth_vector_type (index_type);
5183
5184       /* Create a vector that, for each element, identifies which of
5185          the REDUC_GROUP_SIZE results should use it.  */
5186       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5187       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5188                             build_vector_from_val (index_type, index_mask));
5189
5190       /* Get a neutral vector value.  This is simply a splat of the neutral
5191          scalar value if we have one, otherwise the initial scalar value
5192          is itself a neutral value.  */
5193       tree vector_identity = NULL_TREE;
5194       if (neutral_op)
5195         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5196                                                         neutral_op);
5197       for (unsigned int i = 0; i < group_size; ++i)
5198         {
5199           /* If there's no univeral neutral value, we can use the
5200              initial scalar value from the original PHI.  This is used
5201              for MIN and MAX reduction, for example.  */
5202           if (!neutral_op)
5203             {
5204               tree scalar_value
5205                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5206                                          loop_preheader_edge (loop));
5207               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5208                                                               scalar_value);
5209             }
5210
5211           /* Calculate the equivalent of:
5212
5213              sel[j] = (index[j] == i);
5214
5215              which selects the elements of NEW_PHI_RESULT that should
5216              be included in the result.  */
5217           tree compare_val = build_int_cst (index_elt_type, i);
5218           compare_val = build_vector_from_val (index_type, compare_val);
5219           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5220                                    index, compare_val);
5221
5222           /* Calculate the equivalent of:
5223
5224              vec = seq ? new_phi_result : vector_identity;
5225
5226              VEC is now suitable for a full vector reduction.  */
5227           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5228                                    sel, new_phi_result, vector_identity);
5229
5230           /* Do the reduction and convert it to the appropriate type.  */
5231           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5232                                       TREE_TYPE (vectype), vec);
5233           scalar = gimple_convert (&seq, scalar_type, scalar);
5234           scalar_results.safe_push (scalar);
5235         }
5236       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5237     }
5238   else
5239     {
5240       bool reduce_with_shift;
5241       tree vec_temp;
5242
5243       /* COND reductions all do the final reduction with MAX_EXPR
5244          or MIN_EXPR.  */
5245       if (code == COND_EXPR)
5246         {
5247           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5248               == INTEGER_INDUC_COND_REDUCTION)
5249             code = induc_code;
5250           else
5251             code = MAX_EXPR;
5252         }
5253
5254       /* See if the target wants to do the final (shift) reduction
5255          in a vector mode of smaller size and first reduce upper/lower
5256          halves against each other.  */
5257       enum machine_mode mode1 = mode;
5258       tree vectype1 = vectype;
5259       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5260       unsigned sz1 = sz;
5261       if (!slp_reduc
5262           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5263         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5264
5265       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5266       reduce_with_shift = have_whole_vector_shift (mode1);
5267       if (!VECTOR_MODE_P (mode1))
5268         reduce_with_shift = false;
5269       else
5270         {
5271           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5272           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5273             reduce_with_shift = false;
5274         }
5275
5276       /* First reduce the vector to the desired vector size we should
5277          do shift reduction on by combining upper and lower halves.  */
5278       new_temp = new_phi_result;
5279       while (sz > sz1)
5280         {
5281           gcc_assert (!slp_reduc);
5282           sz /= 2;
5283           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5284
5285           /* The target has to make sure we support lowpart/highpart
5286              extraction, either via direct vector extract or through
5287              an integer mode punning.  */
5288           tree dst1, dst2;
5289           if (convert_optab_handler (vec_extract_optab,
5290                                      TYPE_MODE (TREE_TYPE (new_temp)),
5291                                      TYPE_MODE (vectype1))
5292               != CODE_FOR_nothing)
5293             {
5294               /* Extract sub-vectors directly once vec_extract becomes
5295                  a conversion optab.  */
5296               dst1 = make_ssa_name (vectype1);
5297               epilog_stmt
5298                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5299                                          build3 (BIT_FIELD_REF, vectype1,
5300                                                  new_temp, TYPE_SIZE (vectype1),
5301                                                  bitsize_int (0)));
5302               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5303               dst2 =  make_ssa_name (vectype1);
5304               epilog_stmt
5305                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5306                                          build3 (BIT_FIELD_REF, vectype1,
5307                                                  new_temp, TYPE_SIZE (vectype1),
5308                                                  bitsize_int (sz * BITS_PER_UNIT)));
5309               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5310             }
5311           else
5312             {
5313               /* Extract via punning to appropriately sized integer mode
5314                  vector.  */
5315               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5316                                                             1);
5317               tree etype = build_vector_type (eltype, 2);
5318               gcc_assert (convert_optab_handler (vec_extract_optab,
5319                                                  TYPE_MODE (etype),
5320                                                  TYPE_MODE (eltype))
5321                           != CODE_FOR_nothing);
5322               tree tem = make_ssa_name (etype);
5323               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5324                                                  build1 (VIEW_CONVERT_EXPR,
5325                                                          etype, new_temp));
5326               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5327               new_temp = tem;
5328               tem = make_ssa_name (eltype);
5329               epilog_stmt
5330                   = gimple_build_assign (tem, BIT_FIELD_REF,
5331                                          build3 (BIT_FIELD_REF, eltype,
5332                                                  new_temp, TYPE_SIZE (eltype),
5333                                                  bitsize_int (0)));
5334               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5335               dst1 = make_ssa_name (vectype1);
5336               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5337                                                  build1 (VIEW_CONVERT_EXPR,
5338                                                          vectype1, tem));
5339               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5340               tem = make_ssa_name (eltype);
5341               epilog_stmt
5342                   = gimple_build_assign (tem, BIT_FIELD_REF,
5343                                          build3 (BIT_FIELD_REF, eltype,
5344                                                  new_temp, TYPE_SIZE (eltype),
5345                                                  bitsize_int (sz * BITS_PER_UNIT)));
5346               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5347               dst2 =  make_ssa_name (vectype1);
5348               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5349                                                  build1 (VIEW_CONVERT_EXPR,
5350                                                          vectype1, tem));
5351               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5352             }
5353
5354           new_temp = make_ssa_name (vectype1);
5355           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5356           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5357         }
5358
5359       if (reduce_with_shift && !slp_reduc)
5360         {
5361           int element_bitsize = tree_to_uhwi (bitsize);
5362           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5363              for variable-length vectors and also requires direct target support
5364              for loop reductions.  */
5365           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5366           int nelements = vec_size_in_bits / element_bitsize;
5367           vec_perm_builder sel;
5368           vec_perm_indices indices;
5369
5370           int elt_offset;
5371
5372           tree zero_vec = build_zero_cst (vectype1);
5373           /* Case 2: Create:
5374              for (offset = nelements/2; offset >= 1; offset/=2)
5375                 {
5376                   Create:  va' = vec_shift <va, offset>
5377                   Create:  va = vop <va, va'>
5378                 }  */
5379
5380           tree rhs;
5381
5382           if (dump_enabled_p ())
5383             dump_printf_loc (MSG_NOTE, vect_location,
5384                              "Reduce using vector shifts\n");
5385
5386           mode1 = TYPE_MODE (vectype1);
5387           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5388           for (elt_offset = nelements / 2;
5389                elt_offset >= 1;
5390                elt_offset /= 2)
5391             {
5392               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5393               indices.new_vector (sel, 2, nelements);
5394               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5395               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5396                                                  new_temp, zero_vec, mask);
5397               new_name = make_ssa_name (vec_dest, epilog_stmt);
5398               gimple_assign_set_lhs (epilog_stmt, new_name);
5399               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5400
5401               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5402                                                  new_temp);
5403               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5404               gimple_assign_set_lhs (epilog_stmt, new_temp);
5405               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5406             }
5407
5408           /* 2.4  Extract the final scalar result.  Create:
5409              s_out3 = extract_field <v_out2, bitpos>  */
5410
5411           if (dump_enabled_p ())
5412             dump_printf_loc (MSG_NOTE, vect_location,
5413                              "extract scalar result\n");
5414
5415           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5416                         bitsize, bitsize_zero_node);
5417           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5418           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5419           gimple_assign_set_lhs (epilog_stmt, new_temp);
5420           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5421           scalar_results.safe_push (new_temp);
5422         }
5423       else
5424         {
5425           /* Case 3: Create:
5426              s = extract_field <v_out2, 0>
5427              for (offset = element_size;
5428                   offset < vector_size;
5429                   offset += element_size;)
5430                {
5431                  Create:  s' = extract_field <v_out2, offset>
5432                  Create:  s = op <s, s'>  // For non SLP cases
5433                }  */
5434
5435           if (dump_enabled_p ())
5436             dump_printf_loc (MSG_NOTE, vect_location,
5437                              "Reduce using scalar code.\n");
5438
5439           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5440           int element_bitsize = tree_to_uhwi (bitsize);
5441           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5442             {
5443               int bit_offset;
5444               if (gimple_code (new_phi) == GIMPLE_PHI)
5445                 vec_temp = PHI_RESULT (new_phi);
5446               else
5447                 vec_temp = gimple_assign_lhs (new_phi);
5448               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5449                                  bitsize_zero_node);
5450               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5451               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5452               gimple_assign_set_lhs (epilog_stmt, new_temp);
5453               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5454
5455               /* In SLP we don't need to apply reduction operation, so we just
5456                  collect s' values in SCALAR_RESULTS.  */
5457               if (slp_reduc)
5458                 scalar_results.safe_push (new_temp);
5459
5460               for (bit_offset = element_bitsize;
5461                    bit_offset < vec_size_in_bits;
5462                    bit_offset += element_bitsize)
5463                 {
5464                   tree bitpos = bitsize_int (bit_offset);
5465                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5466                                      bitsize, bitpos);
5467
5468                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5469                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5470                   gimple_assign_set_lhs (epilog_stmt, new_name);
5471                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5472
5473                   if (slp_reduc)
5474                     {
5475                       /* In SLP we don't need to apply reduction operation, so
5476                          we just collect s' values in SCALAR_RESULTS.  */
5477                       new_temp = new_name;
5478                       scalar_results.safe_push (new_name);
5479                     }
5480                   else
5481                     {
5482                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5483                                                          new_name, new_temp);
5484                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5485                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5486                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5487                     }
5488                 }
5489             }
5490
5491           /* The only case where we need to reduce scalar results in SLP, is
5492              unrolling.  If the size of SCALAR_RESULTS is greater than
5493              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5494              REDUC_GROUP_SIZE.  */
5495           if (slp_reduc)
5496             {
5497               tree res, first_res, new_res;
5498               gimple *new_stmt;
5499
5500               /* Reduce multiple scalar results in case of SLP unrolling.  */
5501               for (j = group_size; scalar_results.iterate (j, &res);
5502                    j++)
5503                 {
5504                   first_res = scalar_results[j % group_size];
5505                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5506                                                   first_res, res);
5507                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5508                   gimple_assign_set_lhs (new_stmt, new_res);
5509                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5510                   scalar_results[j % group_size] = new_res;
5511                 }
5512             }
5513           else
5514             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5515             scalar_results.safe_push (new_temp);
5516         }
5517
5518       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5519            == INTEGER_INDUC_COND_REDUCTION)
5520           && !operand_equal_p (initial_def, induc_val, 0))
5521         {
5522           /* Earlier we set the initial value to be a vector if induc_val
5523              values.  Check the result and if it is induc_val then replace
5524              with the original initial value, unless induc_val is
5525              the same as initial_def already.  */
5526           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5527                                   induc_val);
5528
5529           tree tmp = make_ssa_name (new_scalar_dest);
5530           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5531                                              initial_def, new_temp);
5532           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5533           scalar_results[0] = tmp;
5534         }
5535     }
5536
5537 vect_finalize_reduction:
5538
5539   if (double_reduc)
5540     loop = loop->inner;
5541
5542   /* 2.5 Adjust the final result by the initial value of the reduction
5543          variable. (When such adjustment is not needed, then
5544          'adjustment_def' is zero).  For example, if code is PLUS we create:
5545          new_temp = loop_exit_def + adjustment_def  */
5546
5547   if (adjustment_def)
5548     {
5549       gcc_assert (!slp_reduc);
5550       if (nested_in_vect_loop)
5551         {
5552           new_phi = new_phis[0];
5553           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5554           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5555           new_dest = vect_create_destination_var (scalar_dest, vectype);
5556         }
5557       else
5558         {
5559           new_temp = scalar_results[0];
5560           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5561           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5562           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5563         }
5564
5565       epilog_stmt = gimple_build_assign (new_dest, expr);
5566       new_temp = make_ssa_name (new_dest, epilog_stmt);
5567       gimple_assign_set_lhs (epilog_stmt, new_temp);
5568       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5569       if (nested_in_vect_loop)
5570         {
5571           set_vinfo_for_stmt (epilog_stmt,
5572                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5573           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5574                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5575
5576           if (!double_reduc)
5577             scalar_results.quick_push (new_temp);
5578           else
5579             scalar_results[0] = new_temp;
5580         }
5581       else
5582         scalar_results[0] = new_temp;
5583
5584       new_phis[0] = epilog_stmt;
5585     }
5586
5587   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5588           phis with new adjusted scalar results, i.e., replace use <s_out0>
5589           with use <s_out4>.
5590
5591      Transform:
5592         loop_exit:
5593           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5594           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5595           v_out2 = reduce <v_out1>
5596           s_out3 = extract_field <v_out2, 0>
5597           s_out4 = adjust_result <s_out3>
5598           use <s_out0>
5599           use <s_out0>
5600
5601      into:
5602
5603         loop_exit:
5604           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5605           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5606           v_out2 = reduce <v_out1>
5607           s_out3 = extract_field <v_out2, 0>
5608           s_out4 = adjust_result <s_out3>
5609           use <s_out4>
5610           use <s_out4> */
5611
5612
5613   /* In SLP reduction chain we reduce vector results into one vector if
5614      necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
5615      LHS of the last stmt in the reduction chain, since we are looking for
5616      the loop exit phi node.  */
5617   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5618     {
5619       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5620       /* Handle reduction patterns.  */
5621       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5622         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5623
5624       scalar_dest = gimple_assign_lhs (dest_stmt);
5625       group_size = 1;
5626     }
5627
5628   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5629      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5630      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5631      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5632      correspond to the first vector stmt, etc.
5633      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5634   if (group_size > new_phis.length ())
5635     {
5636       ratio = group_size / new_phis.length ();
5637       gcc_assert (!(group_size % new_phis.length ()));
5638     }
5639   else
5640     ratio = 1;
5641
5642   for (k = 0; k < group_size; k++)
5643     {
5644       if (k % ratio == 0)
5645         {
5646           epilog_stmt = new_phis[k / ratio];
5647           reduction_phi = reduction_phis[k / ratio];
5648           if (double_reduc)
5649             inner_phi = inner_phis[k / ratio];
5650         }
5651
5652       if (slp_reduc)
5653         {
5654           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5655
5656           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5657           /* SLP statements can't participate in patterns.  */
5658           gcc_assert (!orig_stmt);
5659           scalar_dest = gimple_assign_lhs (current_stmt);
5660         }
5661
5662       phis.create (3);
5663       /* Find the loop-closed-use at the loop exit of the original scalar
5664          result.  (The reduction result is expected to have two immediate uses -
5665          one at the latch block, and one at the loop exit).  */
5666       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5667         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5668             && !is_gimple_debug (USE_STMT (use_p)))
5669           phis.safe_push (USE_STMT (use_p));
5670
5671       /* While we expect to have found an exit_phi because of loop-closed-ssa
5672          form we can end up without one if the scalar cycle is dead.  */
5673
5674       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5675         {
5676           if (outer_loop)
5677             {
5678               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5679               gphi *vect_phi;
5680
5681               /* FORNOW. Currently not supporting the case that an inner-loop
5682                  reduction is not used in the outer-loop (but only outside the
5683                  outer-loop), unless it is double reduction.  */
5684               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5685                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5686                           || double_reduc);
5687
5688               if (double_reduc)
5689                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5690               else
5691                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5692               if (!double_reduc
5693                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5694                       != vect_double_reduction_def)
5695                 continue;
5696
5697               /* Handle double reduction:
5698
5699                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5700                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5701                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5702                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5703
5704                  At that point the regular reduction (stmt2 and stmt3) is
5705                  already vectorized, as well as the exit phi node, stmt4.
5706                  Here we vectorize the phi node of double reduction, stmt1, and
5707                  update all relevant statements.  */
5708
5709               /* Go through all the uses of s2 to find double reduction phi
5710                  node, i.e., stmt1 above.  */
5711               orig_name = PHI_RESULT (exit_phi);
5712               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5713                 {
5714                   stmt_vec_info use_stmt_vinfo;
5715                   stmt_vec_info new_phi_vinfo;
5716                   tree vect_phi_init, preheader_arg, vect_phi_res;
5717                   basic_block bb = gimple_bb (use_stmt);
5718                   gimple *use;
5719
5720                   /* Check that USE_STMT is really double reduction phi
5721                      node.  */
5722                   if (gimple_code (use_stmt) != GIMPLE_PHI
5723                       || gimple_phi_num_args (use_stmt) != 2
5724                       || bb->loop_father != outer_loop)
5725                     continue;
5726                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5727                   if (!use_stmt_vinfo
5728                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5729                           != vect_double_reduction_def)
5730                     continue;
5731
5732                   /* Create vector phi node for double reduction:
5733                      vs1 = phi <vs0, vs2>
5734                      vs1 was created previously in this function by a call to
5735                        vect_get_vec_def_for_operand and is stored in
5736                        vec_initial_def;
5737                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5738                      vs0 is created here.  */
5739
5740                   /* Create vector phi node.  */
5741                   vect_phi = create_phi_node (vec_initial_def, bb);
5742                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5743                                     loop_vec_info_for_loop (outer_loop));
5744                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5745
5746                   /* Create vs0 - initial def of the double reduction phi.  */
5747                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5748                                              loop_preheader_edge (outer_loop));
5749                   vect_phi_init = get_initial_def_for_reduction
5750                     (stmt, preheader_arg, NULL);
5751
5752                   /* Update phi node arguments with vs0 and vs2.  */
5753                   add_phi_arg (vect_phi, vect_phi_init,
5754                                loop_preheader_edge (outer_loop),
5755                                UNKNOWN_LOCATION);
5756                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5757                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5758                   if (dump_enabled_p ())
5759                     {
5760                       dump_printf_loc (MSG_NOTE, vect_location,
5761                                        "created double reduction phi node: ");
5762                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5763                     }
5764
5765                   vect_phi_res = PHI_RESULT (vect_phi);
5766
5767                   /* Replace the use, i.e., set the correct vs1 in the regular
5768                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5769                      loop is redundant.  */
5770                   use = reduction_phi;
5771                   for (j = 0; j < ncopies; j++)
5772                     {
5773                       edge pr_edge = loop_preheader_edge (loop);
5774                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5775                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5776                     }
5777                 }
5778             }
5779         }
5780
5781       phis.release ();
5782       if (nested_in_vect_loop)
5783         {
5784           if (double_reduc)
5785             loop = outer_loop;
5786           else
5787             continue;
5788         }
5789
5790       phis.create (3);
5791       /* Find the loop-closed-use at the loop exit of the original scalar
5792          result.  (The reduction result is expected to have two immediate uses,
5793          one at the latch block, and one at the loop exit).  For double
5794          reductions we are looking for exit phis of the outer loop.  */
5795       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5796         {
5797           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5798             {
5799               if (!is_gimple_debug (USE_STMT (use_p)))
5800                 phis.safe_push (USE_STMT (use_p));
5801             }
5802           else
5803             {
5804               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5805                 {
5806                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5807
5808                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5809                     {
5810                       if (!flow_bb_inside_loop_p (loop,
5811                                              gimple_bb (USE_STMT (phi_use_p)))
5812                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5813                         phis.safe_push (USE_STMT (phi_use_p));
5814                     }
5815                 }
5816             }
5817         }
5818
5819       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5820         {
5821           /* Replace the uses:  */
5822           orig_name = PHI_RESULT (exit_phi);
5823           scalar_result = scalar_results[k];
5824           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5825             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5826               SET_USE (use_p, scalar_result);
5827         }
5828
5829       phis.release ();
5830     }
5831 }
5832
5833 /* Return a vector of type VECTYPE that is equal to the vector select
5834    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5835    before GSI.  */
5836
5837 static tree
5838 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5839                      tree vec, tree identity)
5840 {
5841   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5842   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5843                                           mask, vec, identity);
5844   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5845   return cond;
5846 }
5847
5848 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5849    order, starting with LHS.  Insert the extraction statements before GSI and
5850    associate the new scalar SSA names with variable SCALAR_DEST.
5851    Return the SSA name for the result.  */
5852
5853 static tree
5854 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5855                        tree_code code, tree lhs, tree vector_rhs)
5856 {
5857   tree vectype = TREE_TYPE (vector_rhs);
5858   tree scalar_type = TREE_TYPE (vectype);
5859   tree bitsize = TYPE_SIZE (scalar_type);
5860   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5861   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5862
5863   for (unsigned HOST_WIDE_INT bit_offset = 0;
5864        bit_offset < vec_size_in_bits;
5865        bit_offset += element_bitsize)
5866     {
5867       tree bitpos = bitsize_int (bit_offset);
5868       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5869                          bitsize, bitpos);
5870
5871       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5872       rhs = make_ssa_name (scalar_dest, stmt);
5873       gimple_assign_set_lhs (stmt, rhs);
5874       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5875
5876       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5877       tree new_name = make_ssa_name (scalar_dest, stmt);
5878       gimple_assign_set_lhs (stmt, new_name);
5879       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5880       lhs = new_name;
5881     }
5882   return lhs;
5883 }
5884
5885 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
5886    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5887    statement.  CODE is the operation performed by STMT and OPS are
5888    its scalar operands.  REDUC_INDEX is the index of the operand in
5889    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5890    implements in-order reduction, or IFN_LAST if we should open-code it.
5891    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5892    that should be used to control the operation in a fully-masked loop.  */
5893
5894 static bool
5895 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5896                                gimple **vec_stmt, slp_tree slp_node,
5897                                gimple *reduc_def_stmt,
5898                                tree_code code, internal_fn reduc_fn,
5899                                tree ops[3], tree vectype_in,
5900                                int reduc_index, vec_loop_masks *masks)
5901 {
5902   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5903   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5904   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5905   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5906   gimple *new_stmt = NULL;
5907
5908   int ncopies;
5909   if (slp_node)
5910     ncopies = 1;
5911   else
5912     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5913
5914   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5915   gcc_assert (ncopies == 1);
5916   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5917   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5918   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5919               == FOLD_LEFT_REDUCTION);
5920
5921   if (slp_node)
5922     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5923                           TYPE_VECTOR_SUBPARTS (vectype_in)));
5924
5925   tree op0 = ops[1 - reduc_index];
5926
5927   int group_size = 1;
5928   gimple *scalar_dest_def;
5929   auto_vec<tree> vec_oprnds0;
5930   if (slp_node)
5931     {
5932       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5933       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5934       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5935     }
5936   else
5937     {
5938       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5939       vec_oprnds0.create (1);
5940       vec_oprnds0.quick_push (loop_vec_def0);
5941       scalar_dest_def = stmt;
5942     }
5943
5944   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5945   tree scalar_type = TREE_TYPE (scalar_dest);
5946   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5947
5948   int vec_num = vec_oprnds0.length ();
5949   gcc_assert (vec_num == 1 || slp_node);
5950   tree vec_elem_type = TREE_TYPE (vectype_out);
5951   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5952
5953   tree vector_identity = NULL_TREE;
5954   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5955     vector_identity = build_zero_cst (vectype_out);
5956
5957   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5958   int i;
5959   tree def0;
5960   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5961     {
5962       tree mask = NULL_TREE;
5963       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5964         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5965
5966       /* Handle MINUS by adding the negative.  */
5967       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5968         {
5969           tree negated = make_ssa_name (vectype_out);
5970           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5971           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5972           def0 = negated;
5973         }
5974
5975       if (mask)
5976         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5977                                     vector_identity);
5978
5979       /* On the first iteration the input is simply the scalar phi
5980          result, and for subsequent iterations it is the output of
5981          the preceding operation.  */
5982       if (reduc_fn != IFN_LAST)
5983         {
5984           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5985           /* For chained SLP reductions the output of the previous reduction
5986              operation serves as the input of the next. For the final statement
5987              the output cannot be a temporary - we reuse the original
5988              scalar destination of the last statement.  */
5989           if (i != vec_num - 1)
5990             {
5991               gimple_set_lhs (new_stmt, scalar_dest_var);
5992               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5993               gimple_set_lhs (new_stmt, reduc_var);
5994             }
5995         }
5996       else
5997         {
5998           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5999                                              reduc_var, def0);
6000           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6001           /* Remove the statement, so that we can use the same code paths
6002              as for statements that we've just created.  */
6003           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6004           gsi_remove (&tmp_gsi, false);
6005         }
6006
6007       if (i == vec_num - 1)
6008         {
6009           gimple_set_lhs (new_stmt, scalar_dest);
6010           vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6011         }
6012       else
6013         vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6014
6015       if (slp_node)
6016         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6017     }
6018
6019   if (!slp_node)
6020     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6021
6022   return true;
6023 }
6024
6025 /* Function is_nonwrapping_integer_induction.
6026
6027    Check if STMT (which is part of loop LOOP) both increments and
6028    does not cause overflow.  */
6029
6030 static bool
6031 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6032 {
6033   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6034   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6035   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6036   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6037   widest_int ni, max_loop_value, lhs_max;
6038   wi::overflow_type overflow = wi::OVF_NONE;
6039
6040   /* Make sure the loop is integer based.  */
6041   if (TREE_CODE (base) != INTEGER_CST
6042       || TREE_CODE (step) != INTEGER_CST)
6043     return false;
6044
6045   /* Check that the max size of the loop will not wrap.  */
6046
6047   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6048     return true;
6049
6050   if (! max_stmt_executions (loop, &ni))
6051     return false;
6052
6053   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6054                             &overflow);
6055   if (overflow)
6056     return false;
6057
6058   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6059                             TYPE_SIGN (lhs_type), &overflow);
6060   if (overflow)
6061     return false;
6062
6063   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6064           <= TYPE_PRECISION (lhs_type));
6065 }
6066
6067 /* Function vectorizable_reduction.
6068
6069    Check if STMT performs a reduction operation that can be vectorized.
6070    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6071    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6072    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6073
6074    This function also handles reduction idioms (patterns) that have been
6075    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6076    of this form:
6077      X = pattern_expr (arg0, arg1, ..., X)
6078    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
6079    sequence that had been detected and replaced by the pattern-stmt (STMT).
6080
6081    This function also handles reduction of condition expressions, for example:
6082      for (int i = 0; i < N; i++)
6083        if (a[i] < value)
6084          last = a[i];
6085    This is handled by vectorising the loop and creating an additional vector
6086    containing the loop indexes for which "a[i] < value" was true.  In the
6087    function epilogue this is reduced to a single max value and then used to
6088    index into the vector of results.
6089
6090    In some cases of reduction patterns, the type of the reduction variable X is
6091    different than the type of the other arguments of STMT.
6092    In such cases, the vectype that is used when transforming STMT into a vector
6093    stmt is different than the vectype that is used to determine the
6094    vectorization factor, because it consists of a different number of elements
6095    than the actual number of elements that are being operated upon in parallel.
6096
6097    For example, consider an accumulation of shorts into an int accumulator.
6098    On some targets it's possible to vectorize this pattern operating on 8
6099    shorts at a time (hence, the vectype for purposes of determining the
6100    vectorization factor should be V8HI); on the other hand, the vectype that
6101    is used to create the vector form is actually V4SI (the type of the result).
6102
6103    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6104    indicates what is the actual level of parallelism (V8HI in the example), so
6105    that the right vectorization factor would be derived.  This vectype
6106    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6107    be used to create the vectorized stmt.  The right vectype for the vectorized
6108    stmt is obtained from the type of the result X:
6109         get_vectype_for_scalar_type (TREE_TYPE (X))
6110
6111    This means that, contrary to "regular" reductions (or "regular" stmts in
6112    general), the following equation:
6113       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6114    does *NOT* necessarily hold for reduction patterns.  */
6115
6116 bool
6117 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6118                         gimple **vec_stmt, slp_tree slp_node,
6119                         slp_instance slp_node_instance,
6120                         stmt_vector_for_cost *cost_vec)
6121 {
6122   tree vec_dest;
6123   tree scalar_dest;
6124   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6125   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6126   tree vectype_in = NULL_TREE;
6127   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6128   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6129   enum tree_code code, orig_code;
6130   internal_fn reduc_fn;
6131   machine_mode vec_mode;
6132   int op_type;
6133   optab optab;
6134   tree new_temp = NULL_TREE;
6135   gimple *def_stmt;
6136   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6137   gimple *cond_reduc_def_stmt = NULL;
6138   enum tree_code cond_reduc_op_code = ERROR_MARK;
6139   tree scalar_type;
6140   bool is_simple_use;
6141   gimple *orig_stmt;
6142   stmt_vec_info orig_stmt_info = NULL;
6143   int i;
6144   int ncopies;
6145   int epilog_copies;
6146   stmt_vec_info prev_stmt_info, prev_phi_info;
6147   bool single_defuse_cycle = false;
6148   gimple *new_stmt = NULL;
6149   int j;
6150   tree ops[3];
6151   enum vect_def_type dts[3];
6152   bool nested_cycle = false, found_nested_cycle_def = false;
6153   bool double_reduc = false;
6154   basic_block def_bb;
6155   struct loop * def_stmt_loop, *outer_loop = NULL;
6156   tree def_arg;
6157   gimple *def_arg_stmt;
6158   auto_vec<tree> vec_oprnds0;
6159   auto_vec<tree> vec_oprnds1;
6160   auto_vec<tree> vec_oprnds2;
6161   auto_vec<tree> vect_defs;
6162   auto_vec<gimple *> phis;
6163   int vec_num;
6164   tree def0, tem;
6165   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6166   tree cond_reduc_val = NULL_TREE;
6167
6168   /* Make sure it was already recognized as a reduction computation.  */
6169   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6170       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6171     return false;
6172
6173   if (nested_in_vect_loop_p (loop, stmt))
6174     {
6175       outer_loop = loop;
6176       loop = loop->inner;
6177       nested_cycle = true;
6178     }
6179
6180   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6181     gcc_assert (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt);
6182
6183   if (gimple_code (stmt) == GIMPLE_PHI)
6184     {
6185       /* Analysis is fully done on the reduction stmt invocation.  */
6186       if (! vec_stmt)
6187         {
6188           if (slp_node)
6189             slp_node_instance->reduc_phis = slp_node;
6190
6191           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6192           return true;
6193         }
6194
6195       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6196         /* Leave the scalar phi in place.  Note that checking
6197            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6198            for reductions involving a single statement.  */
6199         return true;
6200
6201       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6202       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6203         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6204
6205       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6206           == EXTRACT_LAST_REDUCTION)
6207         /* Leave the scalar phi in place.  */
6208         return true;
6209
6210       gcc_assert (is_gimple_assign (reduc_stmt));
6211       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6212         {
6213           tree op = gimple_op (reduc_stmt, k);
6214           if (op == gimple_phi_result (stmt))
6215             continue;
6216           if (k == 1
6217               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6218             continue;
6219           if (!vectype_in
6220               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6221                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6222             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6223           break;
6224         }
6225       gcc_assert (vectype_in);
6226
6227       if (slp_node)
6228         ncopies = 1;
6229       else
6230         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6231
6232       use_operand_p use_p;
6233       gimple *use_stmt;
6234       if (ncopies > 1
6235           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6236               <= vect_used_only_live)
6237           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6238           && (use_stmt == reduc_stmt
6239               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6240                   == reduc_stmt)))
6241         single_defuse_cycle = true;
6242
6243       /* Create the destination vector  */
6244       scalar_dest = gimple_assign_lhs (reduc_stmt);
6245       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6246
6247       if (slp_node)
6248         /* The size vect_schedule_slp_instance computes is off for us.  */
6249         vec_num = vect_get_num_vectors
6250           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6251            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6252            vectype_in);
6253       else
6254         vec_num = 1;
6255
6256       /* Generate the reduction PHIs upfront.  */
6257       prev_phi_info = NULL;
6258       for (j = 0; j < ncopies; j++)
6259         {
6260           if (j == 0 || !single_defuse_cycle)
6261             {
6262               for (i = 0; i < vec_num; i++)
6263                 {
6264                   /* Create the reduction-phi that defines the reduction
6265                      operand.  */
6266                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6267                   set_vinfo_for_stmt (new_phi,
6268                                       new_stmt_vec_info (new_phi, loop_vinfo));
6269
6270                   if (slp_node)
6271                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6272                   else
6273                     {
6274                       if (j == 0)
6275                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6276                       else
6277                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6278                       prev_phi_info = vinfo_for_stmt (new_phi);
6279                     }
6280                 }
6281             }
6282         }
6283
6284       return true;
6285     }
6286
6287   /* 1. Is vectorizable reduction?  */
6288   /* Not supportable if the reduction variable is used in the loop, unless
6289      it's a reduction chain.  */
6290   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6291       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6292     return false;
6293
6294   /* Reductions that are not used even in an enclosing outer-loop,
6295      are expected to be "live" (used out of the loop).  */
6296   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6297       && !STMT_VINFO_LIVE_P (stmt_info))
6298     return false;
6299
6300   /* 2. Has this been recognized as a reduction pattern?
6301
6302      Check if STMT represents a pattern that has been recognized
6303      in earlier analysis stages.  For stmts that represent a pattern,
6304      the STMT_VINFO_RELATED_STMT field records the last stmt in
6305      the original sequence that constitutes the pattern.  */
6306
6307   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6308   if (orig_stmt)
6309     {
6310       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6311       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6312       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6313     }
6314
6315   /* 3. Check the operands of the operation.  The first operands are defined
6316         inside the loop body. The last operand is the reduction variable,
6317         which is defined by the loop-header-phi.  */
6318
6319   gcc_assert (is_gimple_assign (stmt));
6320
6321   /* Flatten RHS.  */
6322   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6323     {
6324     case GIMPLE_BINARY_RHS:
6325       code = gimple_assign_rhs_code (stmt);
6326       op_type = TREE_CODE_LENGTH (code);
6327       gcc_assert (op_type == binary_op);
6328       ops[0] = gimple_assign_rhs1 (stmt);
6329       ops[1] = gimple_assign_rhs2 (stmt);
6330       break;
6331
6332     case GIMPLE_TERNARY_RHS:
6333       code = gimple_assign_rhs_code (stmt);
6334       op_type = TREE_CODE_LENGTH (code);
6335       gcc_assert (op_type == ternary_op);
6336       ops[0] = gimple_assign_rhs1 (stmt);
6337       ops[1] = gimple_assign_rhs2 (stmt);
6338       ops[2] = gimple_assign_rhs3 (stmt);
6339       break;
6340
6341     case GIMPLE_UNARY_RHS:
6342       return false;
6343
6344     default:
6345       gcc_unreachable ();
6346     }
6347
6348   if (code == COND_EXPR && slp_node)
6349     return false;
6350
6351   scalar_dest = gimple_assign_lhs (stmt);
6352   scalar_type = TREE_TYPE (scalar_dest);
6353   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6354       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6355     return false;
6356
6357   /* Do not try to vectorize bit-precision reductions.  */
6358   if (!type_has_mode_precision_p (scalar_type))
6359     return false;
6360
6361   /* All uses but the last are expected to be defined in the loop.
6362      The last use is the reduction variable.  In case of nested cycle this
6363      assumption is not true: we use reduc_index to record the index of the
6364      reduction variable.  */
6365   gimple *reduc_def_stmt = NULL;
6366   int reduc_index = -1;
6367   for (i = 0; i < op_type; i++)
6368     {
6369       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6370       if (i == 0 && code == COND_EXPR)
6371         continue;
6372
6373       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6374                                           &dts[i], &tem, &def_stmt);
6375       dt = dts[i];
6376       gcc_assert (is_simple_use);
6377       if (dt == vect_reduction_def)
6378         {
6379           reduc_def_stmt = def_stmt;
6380           reduc_index = i;
6381           continue;
6382         }
6383       else if (tem)
6384         {
6385           /* To properly compute ncopies we are interested in the widest
6386              input type in case we're looking at a widening accumulation.  */
6387           if (!vectype_in
6388               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6389                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6390             vectype_in = tem;
6391         }
6392
6393       if (dt != vect_internal_def
6394           && dt != vect_external_def
6395           && dt != vect_constant_def
6396           && dt != vect_induction_def
6397           && !(dt == vect_nested_cycle && nested_cycle))
6398         return false;
6399
6400       if (dt == vect_nested_cycle)
6401         {
6402           found_nested_cycle_def = true;
6403           reduc_def_stmt = def_stmt;
6404           reduc_index = i;
6405         }
6406
6407       if (i == 1 && code == COND_EXPR)
6408         {
6409           /* Record how value of COND_EXPR is defined.  */
6410           if (dt == vect_constant_def)
6411             {
6412               cond_reduc_dt = dt;
6413               cond_reduc_val = ops[i];
6414             }
6415           if (dt == vect_induction_def
6416               && def_stmt != NULL
6417               && is_nonwrapping_integer_induction (def_stmt, loop))
6418             {
6419               cond_reduc_dt = dt;
6420               cond_reduc_def_stmt = def_stmt;
6421             }
6422         }
6423     }
6424
6425   if (!vectype_in)
6426     vectype_in = vectype_out;
6427
6428   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6429      directy used in stmt.  */
6430   if (reduc_index == -1)
6431     {
6432       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6433         {
6434           if (dump_enabled_p ())
6435             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6436                              "in-order reduction chain without SLP.\n");
6437           return false;
6438         }
6439
6440       if (orig_stmt)
6441         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6442       else
6443         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6444     }
6445
6446   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6447     return false;
6448
6449   if (!(reduc_index == -1
6450         || dts[reduc_index] == vect_reduction_def
6451         || dts[reduc_index] == vect_nested_cycle
6452         || ((dts[reduc_index] == vect_internal_def
6453              || dts[reduc_index] == vect_external_def
6454              || dts[reduc_index] == vect_constant_def
6455              || dts[reduc_index] == vect_induction_def)
6456             && nested_cycle && found_nested_cycle_def)))
6457     {
6458       /* For pattern recognized stmts, orig_stmt might be a reduction,
6459          but some helper statements for the pattern might not, or
6460          might be COND_EXPRs with reduction uses in the condition.  */
6461       gcc_assert (orig_stmt);
6462       return false;
6463     }
6464
6465   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6466   /* PHIs should not participate in patterns.  */
6467   gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6468   enum vect_reduction_type v_reduc_type
6469     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6470   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6471
6472   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6473   /* If we have a condition reduction, see if we can simplify it further.  */
6474   if (v_reduc_type == COND_REDUCTION)
6475     {
6476       /* TODO: We can't yet handle reduction chains, since we need to treat
6477          each COND_EXPR in the chain specially, not just the last one.
6478          E.g. for:
6479
6480             x_1 = PHI <x_3, ...>
6481             x_2 = a_2 ? ... : x_1;
6482             x_3 = a_3 ? ... : x_2;
6483
6484          we're interested in the last element in x_3 for which a_2 || a_3
6485          is true, whereas the current reduction chain handling would
6486          vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6487          as a reduction operation.  */
6488       if (reduc_index == -1)
6489         {
6490           if (dump_enabled_p ())
6491             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6492                              "conditional reduction chains not supported\n");
6493           return false;
6494         }
6495
6496       /* vect_is_simple_reduction ensured that operand 2 is the
6497          loop-carried operand.  */
6498       gcc_assert (reduc_index == 2);
6499
6500       /* Loop peeling modifies initial value of reduction PHI, which
6501          makes the reduction stmt to be transformed different to the
6502          original stmt analyzed.  We need to record reduction code for
6503          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6504          it can be used directly at transform stage.  */
6505       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6506           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6507         {
6508           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6509           gcc_assert (cond_reduc_dt == vect_constant_def);
6510           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6511         }
6512       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6513                                                vectype_in, OPTIMIZE_FOR_SPEED))
6514         {
6515           if (dump_enabled_p ())
6516             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6517                              "optimizing condition reduction with"
6518                              " FOLD_EXTRACT_LAST.\n");
6519           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6520         }
6521       else if (cond_reduc_dt == vect_induction_def)
6522         {
6523           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6524           tree base
6525             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6526           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6527
6528           gcc_assert (TREE_CODE (base) == INTEGER_CST
6529                       && TREE_CODE (step) == INTEGER_CST);
6530           cond_reduc_val = NULL_TREE;
6531           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6532              above base; punt if base is the minimum value of the type for
6533              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6534           if (tree_int_cst_sgn (step) == -1)
6535             {
6536               cond_reduc_op_code = MIN_EXPR;
6537               if (tree_int_cst_sgn (base) == -1)
6538                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6539               else if (tree_int_cst_lt (base,
6540                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6541                 cond_reduc_val
6542                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6543             }
6544           else
6545             {
6546               cond_reduc_op_code = MAX_EXPR;
6547               if (tree_int_cst_sgn (base) == 1)
6548                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6549               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6550                                         base))
6551                 cond_reduc_val
6552                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6553             }
6554           if (cond_reduc_val)
6555             {
6556               if (dump_enabled_p ())
6557                 dump_printf_loc (MSG_NOTE, vect_location,
6558                                  "condition expression based on "
6559                                  "integer induction.\n");
6560               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6561                 = INTEGER_INDUC_COND_REDUCTION;
6562             }
6563         }
6564       else if (cond_reduc_dt == vect_constant_def)
6565         {
6566           enum vect_def_type cond_initial_dt;
6567           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6568           tree cond_initial_val
6569             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6570
6571           gcc_assert (cond_reduc_val != NULL_TREE);
6572           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6573           if (cond_initial_dt == vect_constant_def
6574               && types_compatible_p (TREE_TYPE (cond_initial_val),
6575                                      TREE_TYPE (cond_reduc_val)))
6576             {
6577               tree e = fold_binary (LE_EXPR, boolean_type_node,
6578                                     cond_initial_val, cond_reduc_val);
6579               if (e && (integer_onep (e) || integer_zerop (e)))
6580                 {
6581                   if (dump_enabled_p ())
6582                     dump_printf_loc (MSG_NOTE, vect_location,
6583                                      "condition expression based on "
6584                                      "compile time constant.\n");
6585                   /* Record reduction code at analysis stage.  */
6586                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6587                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6588                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6589                     = CONST_COND_REDUCTION;
6590                 }
6591             }
6592         }
6593     }
6594
6595   if (orig_stmt)
6596     gcc_assert (tmp == orig_stmt
6597                 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6598                     == orig_stmt));
6599   else
6600     /* We changed STMT to be the first stmt in reduction chain, hence we
6601        check that in this case the first element in the chain is STMT.  */
6602     gcc_assert (stmt == tmp
6603                 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6604
6605   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6606     return false;
6607
6608   if (slp_node)
6609     ncopies = 1;
6610   else
6611     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6612
6613   gcc_assert (ncopies >= 1);
6614
6615   vec_mode = TYPE_MODE (vectype_in);
6616   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6617
6618   if (code == COND_EXPR)
6619     {
6620       /* Only call during the analysis stage, otherwise we'll lose
6621          STMT_VINFO_TYPE.  */
6622       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6623                                                 ops[reduc_index], 0, NULL,
6624                                                 cost_vec))
6625         {
6626           if (dump_enabled_p ())
6627             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6628                              "unsupported condition in reduction\n");
6629           return false;
6630         }
6631     }
6632   else
6633     {
6634       /* 4. Supportable by target?  */
6635
6636       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6637           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6638         {
6639           /* Shifts and rotates are only supported by vectorizable_shifts,
6640              not vectorizable_reduction.  */
6641           if (dump_enabled_p ())
6642             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6643                              "unsupported shift or rotation.\n");
6644           return false;
6645         }
6646
6647       /* 4.1. check support for the operation in the loop  */
6648       optab = optab_for_tree_code (code, vectype_in, optab_default);
6649       if (!optab)
6650         {
6651           if (dump_enabled_p ())
6652             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6653                              "no optab.\n");
6654
6655           return false;
6656         }
6657
6658       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6659         {
6660           if (dump_enabled_p ())
6661             dump_printf (MSG_NOTE, "op not supported by target.\n");
6662
6663           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6664               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6665             return false;
6666
6667           if (dump_enabled_p ())
6668             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6669         }
6670
6671       /* Worthwhile without SIMD support?  */
6672       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6673           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6674         {
6675           if (dump_enabled_p ())
6676             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6677                              "not worthwhile without SIMD support.\n");
6678
6679           return false;
6680         }
6681     }
6682
6683   /* 4.2. Check support for the epilog operation.
6684
6685           If STMT represents a reduction pattern, then the type of the
6686           reduction variable may be different than the type of the rest
6687           of the arguments.  For example, consider the case of accumulation
6688           of shorts into an int accumulator; The original code:
6689                         S1: int_a = (int) short_a;
6690           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6691
6692           was replaced with:
6693                         STMT: int_acc = widen_sum <short_a, int_acc>
6694
6695           This means that:
6696           1. The tree-code that is used to create the vector operation in the
6697              epilog code (that reduces the partial results) is not the
6698              tree-code of STMT, but is rather the tree-code of the original
6699              stmt from the pattern that STMT is replacing.  I.e, in the example
6700              above we want to use 'widen_sum' in the loop, but 'plus' in the
6701              epilog.
6702           2. The type (mode) we use to check available target support
6703              for the vector operation to be created in the *epilog*, is
6704              determined by the type of the reduction variable (in the example
6705              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6706              However the type (mode) we use to check available target support
6707              for the vector operation to be created *inside the loop*, is
6708              determined by the type of the other arguments to STMT (in the
6709              example we'd check this: optab_handler (widen_sum_optab,
6710              vect_short_mode)).
6711
6712           This is contrary to "regular" reductions, in which the types of all
6713           the arguments are the same as the type of the reduction variable.
6714           For "regular" reductions we can therefore use the same vector type
6715           (and also the same tree-code) when generating the epilog code and
6716           when generating the code inside the loop.  */
6717
6718   vect_reduction_type reduction_type
6719     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6720   if (orig_stmt
6721       && (reduction_type == TREE_CODE_REDUCTION
6722           || reduction_type == FOLD_LEFT_REDUCTION))
6723     {
6724       /* This is a reduction pattern: get the vectype from the type of the
6725          reduction variable, and get the tree-code from orig_stmt.  */
6726       orig_code = gimple_assign_rhs_code (orig_stmt);
6727       gcc_assert (vectype_out);
6728       vec_mode = TYPE_MODE (vectype_out);
6729     }
6730   else
6731     {
6732       /* Regular reduction: use the same vectype and tree-code as used for
6733          the vector code inside the loop can be used for the epilog code. */
6734       orig_code = code;
6735
6736       if (code == MINUS_EXPR)
6737         orig_code = PLUS_EXPR;
6738
6739       /* For simple condition reductions, replace with the actual expression
6740          we want to base our reduction around.  */
6741       if (reduction_type == CONST_COND_REDUCTION)
6742         {
6743           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6744           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6745         }
6746       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6747         orig_code = cond_reduc_op_code;
6748     }
6749
6750   if (nested_cycle)
6751     {
6752       def_bb = gimple_bb (reduc_def_stmt);
6753       def_stmt_loop = def_bb->loop_father;
6754       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6755                                        loop_preheader_edge (def_stmt_loop));
6756       if (TREE_CODE (def_arg) == SSA_NAME
6757           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6758           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6759           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6760           && vinfo_for_stmt (def_arg_stmt)
6761           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6762               == vect_double_reduction_def)
6763         double_reduc = true;
6764     }
6765
6766   reduc_fn = IFN_LAST;
6767
6768   if (reduction_type == TREE_CODE_REDUCTION
6769       || reduction_type == FOLD_LEFT_REDUCTION
6770       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6771       || reduction_type == CONST_COND_REDUCTION)
6772     {
6773       if (reduction_type == FOLD_LEFT_REDUCTION
6774           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6775           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6776         {
6777           if (reduc_fn != IFN_LAST
6778               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6779                                                   OPTIMIZE_FOR_SPEED))
6780             {
6781               if (dump_enabled_p ())
6782                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6783                                  "reduc op not supported by target.\n");
6784
6785               reduc_fn = IFN_LAST;
6786             }
6787         }
6788       else
6789         {
6790           if (!nested_cycle || double_reduc)
6791             {
6792               if (dump_enabled_p ())
6793                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6794                                  "no reduc code for scalar code.\n");
6795
6796               return false;
6797             }
6798         }
6799     }
6800   else if (reduction_type == COND_REDUCTION)
6801     {
6802       int scalar_precision
6803         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6804       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6805       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6806                                                 nunits_out);
6807
6808       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6809                                           OPTIMIZE_FOR_SPEED))
6810         reduc_fn = IFN_REDUC_MAX;
6811     }
6812
6813   if (reduction_type != EXTRACT_LAST_REDUCTION
6814       && reduc_fn == IFN_LAST
6815       && !nunits_out.is_constant ())
6816     {
6817       if (dump_enabled_p ())
6818         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6819                          "missing target support for reduction on"
6820                          " variable-length vectors.\n");
6821       return false;
6822     }
6823
6824   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6825       && ncopies > 1)
6826     {
6827       if (dump_enabled_p ())
6828         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6829                          "multiple types in double reduction or condition "
6830                          "reduction.\n");
6831       return false;
6832     }
6833
6834   /* For SLP reductions, see if there is a neutral value we can use.  */
6835   tree neutral_op = NULL_TREE;
6836   if (slp_node)
6837     neutral_op = neutral_op_for_slp_reduction
6838                    (slp_node_instance->reduc_phis, code,
6839                     REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6840
6841   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6842     {
6843       /* We can't support in-order reductions of code such as this:
6844
6845            for (int i = 0; i < n1; ++i)
6846              for (int j = 0; j < n2; ++j)
6847                l += a[j];
6848
6849          since GCC effectively transforms the loop when vectorizing:
6850
6851            for (int i = 0; i < n1 / VF; ++i)
6852              for (int j = 0; j < n2; ++j)
6853                for (int k = 0; k < VF; ++k)
6854                  l += a[j];
6855
6856          which is a reassociation of the original operation.  */
6857       if (dump_enabled_p ())
6858         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6859                          "in-order double reduction not supported.\n");
6860
6861       return false;
6862     }
6863
6864   if (reduction_type == FOLD_LEFT_REDUCTION
6865       && slp_node
6866       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6867     {
6868       /* We cannot use in-order reductions in this case because there is
6869          an implicit reassociation of the operations involved.  */
6870       if (dump_enabled_p ())
6871         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6872                          "in-order unchained SLP reductions not supported.\n");
6873       return false;
6874     }
6875
6876   /* For double reductions, and for SLP reductions with a neutral value,
6877      we construct a variable-length initial vector by loading a vector
6878      full of the neutral value and then shift-and-inserting the start
6879      values into the low-numbered elements.  */
6880   if ((double_reduc || neutral_op)
6881       && !nunits_out.is_constant ()
6882       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6883                                           vectype_out, OPTIMIZE_FOR_SPEED))
6884     {
6885       if (dump_enabled_p ())
6886         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6887                          "reduction on variable-length vectors requires"
6888                          " target support for a vector-shift-and-insert"
6889                          " operation.\n");
6890       return false;
6891     }
6892
6893   /* Check extra constraints for variable-length unchained SLP reductions.  */
6894   if (STMT_SLP_TYPE (stmt_info)
6895       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6896       && !nunits_out.is_constant ())
6897     {
6898       /* We checked above that we could build the initial vector when
6899          there's a neutral element value.  Check here for the case in
6900          which each SLP statement has its own initial value and in which
6901          that value needs to be repeated for every instance of the
6902          statement within the initial vector.  */
6903       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6904       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6905       if (!neutral_op
6906           && !can_duplicate_and_interleave_p (group_size, elt_mode))
6907         {
6908           if (dump_enabled_p ())
6909             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6910                              "unsupported form of SLP reduction for"
6911                              " variable-length vectors: cannot build"
6912                              " initial vector.\n");
6913           return false;
6914         }
6915       /* The epilogue code relies on the number of elements being a multiple
6916          of the group size.  The duplicate-and-interleave approach to setting
6917          up the the initial vector does too.  */
6918       if (!multiple_p (nunits_out, group_size))
6919         {
6920           if (dump_enabled_p ())
6921             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6922                              "unsupported form of SLP reduction for"
6923                              " variable-length vectors: the vector size"
6924                              " is not a multiple of the number of results.\n");
6925           return false;
6926         }
6927     }
6928
6929   /* In case of widenning multiplication by a constant, we update the type
6930      of the constant to be the type of the other operand.  We check that the
6931      constant fits the type in the pattern recognition pass.  */
6932   if (code == DOT_PROD_EXPR
6933       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6934     {
6935       if (TREE_CODE (ops[0]) == INTEGER_CST)
6936         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6937       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6938         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6939       else
6940         {
6941           if (dump_enabled_p ())
6942             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6943                              "invalid types in dot-prod\n");
6944
6945           return false;
6946         }
6947     }
6948
6949   if (reduction_type == COND_REDUCTION)
6950     {
6951       widest_int ni;
6952
6953       if (! max_loop_iterations (loop, &ni))
6954         {
6955           if (dump_enabled_p ())
6956             dump_printf_loc (MSG_NOTE, vect_location,
6957                              "loop count not known, cannot create cond "
6958                              "reduction.\n");
6959           return false;
6960         }
6961       /* Convert backedges to iterations.  */
6962       ni += 1;
6963
6964       /* The additional index will be the same type as the condition.  Check
6965          that the loop can fit into this less one (because we'll use up the
6966          zero slot for when there are no matches).  */
6967       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6968       if (wi::geu_p (ni, wi::to_widest (max_index)))
6969         {
6970           if (dump_enabled_p ())
6971             dump_printf_loc (MSG_NOTE, vect_location,
6972                              "loop size is greater than data size.\n");
6973           return false;
6974         }
6975     }
6976
6977   /* In case the vectorization factor (VF) is bigger than the number
6978      of elements that we can fit in a vectype (nunits), we have to generate
6979      more than one vector stmt - i.e - we need to "unroll" the
6980      vector stmt by a factor VF/nunits.  For more details see documentation
6981      in vectorizable_operation.  */
6982
6983   /* If the reduction is used in an outer loop we need to generate
6984      VF intermediate results, like so (e.g. for ncopies=2):
6985         r0 = phi (init, r0)
6986         r1 = phi (init, r1)
6987         r0 = x0 + r0;
6988         r1 = x1 + r1;
6989     (i.e. we generate VF results in 2 registers).
6990     In this case we have a separate def-use cycle for each copy, and therefore
6991     for each copy we get the vector def for the reduction variable from the
6992     respective phi node created for this copy.
6993
6994     Otherwise (the reduction is unused in the loop nest), we can combine
6995     together intermediate results, like so (e.g. for ncopies=2):
6996         r = phi (init, r)
6997         r = x0 + r;
6998         r = x1 + r;
6999    (i.e. we generate VF/2 results in a single register).
7000    In this case for each copy we get the vector def for the reduction variable
7001    from the vectorized reduction operation generated in the previous iteration.
7002
7003    This only works when we see both the reduction PHI and its only consumer
7004    in vectorizable_reduction and there are no intermediate stmts
7005    participating.  */
7006   use_operand_p use_p;
7007   gimple *use_stmt;
7008   if (ncopies > 1
7009       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7010       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7011       && (use_stmt == stmt
7012           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7013     {
7014       single_defuse_cycle = true;
7015       epilog_copies = 1;
7016     }
7017   else
7018     epilog_copies = ncopies;
7019
7020   /* If the reduction stmt is one of the patterns that have lane
7021      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7022   if ((ncopies > 1
7023        && ! single_defuse_cycle)
7024       && (code == DOT_PROD_EXPR
7025           || code == WIDEN_SUM_EXPR
7026           || code == SAD_EXPR))
7027     {
7028       if (dump_enabled_p ())
7029         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7030                          "multi def-use cycle not possible for lane-reducing "
7031                          "reduction operation\n");
7032       return false;
7033     }
7034
7035   if (slp_node)
7036     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7037   else
7038     vec_num = 1;
7039
7040   internal_fn cond_fn = get_conditional_internal_fn (code);
7041   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7042
7043   if (!vec_stmt) /* transformation not required.  */
7044     {
7045       vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7046       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7047         {
7048           if (reduction_type != FOLD_LEFT_REDUCTION
7049               && (cond_fn == IFN_LAST
7050                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7051                                                       OPTIMIZE_FOR_SPEED)))
7052             {
7053               if (dump_enabled_p ())
7054                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7055                                  "can't use a fully-masked loop because no"
7056                                  " conditional operation is available.\n");
7057               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7058             }
7059           else if (reduc_index == -1)
7060             {
7061               if (dump_enabled_p ())
7062                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7063                                  "can't use a fully-masked loop for chained"
7064                                  " reductions.\n");
7065               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7066             }
7067           else
7068             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7069                                    vectype_in);
7070         }
7071       if (dump_enabled_p ()
7072           && reduction_type == FOLD_LEFT_REDUCTION)
7073         dump_printf_loc (MSG_NOTE, vect_location,
7074                          "using an in-order (fold-left) reduction.\n");
7075       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7076       return true;
7077     }
7078
7079   /* Transform.  */
7080
7081   if (dump_enabled_p ())
7082     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7083
7084   /* FORNOW: Multiple types are not supported for condition.  */
7085   if (code == COND_EXPR)
7086     gcc_assert (ncopies == 1);
7087
7088   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7089
7090   if (reduction_type == FOLD_LEFT_REDUCTION)
7091     return vectorize_fold_left_reduction
7092       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7093        reduc_fn, ops, vectype_in, reduc_index, masks);
7094
7095   if (reduction_type == EXTRACT_LAST_REDUCTION)
7096     {
7097       gcc_assert (!slp_node);
7098       return vectorizable_condition (stmt, gsi, vec_stmt,
7099                                      NULL, reduc_index, NULL, NULL);
7100     }
7101
7102   /* Create the destination vector  */
7103   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7104
7105   prev_stmt_info = NULL;
7106   prev_phi_info = NULL;
7107   if (!slp_node)
7108     {
7109       vec_oprnds0.create (1);
7110       vec_oprnds1.create (1);
7111       if (op_type == ternary_op)
7112         vec_oprnds2.create (1);
7113     }
7114
7115   phis.create (vec_num);
7116   vect_defs.create (vec_num);
7117   if (!slp_node)
7118     vect_defs.quick_push (NULL_TREE);
7119
7120   if (slp_node)
7121     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7122   else
7123     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7124
7125   for (j = 0; j < ncopies; j++)
7126     {
7127       if (code == COND_EXPR)
7128         {
7129           gcc_assert (!slp_node);
7130           vectorizable_condition (stmt, gsi, vec_stmt,
7131                                   PHI_RESULT (phis[0]),
7132                                   reduc_index, NULL, NULL);
7133           /* Multiple types are not supported for condition.  */
7134           break;
7135         }
7136
7137       /* Handle uses.  */
7138       if (j == 0)
7139         {
7140           if (slp_node)
7141             {
7142               /* Get vec defs for all the operands except the reduction index,
7143                  ensuring the ordering of the ops in the vector is kept.  */
7144               auto_vec<tree, 3> slp_ops;
7145               auto_vec<vec<tree>, 3> vec_defs;
7146
7147               slp_ops.quick_push (ops[0]);
7148               slp_ops.quick_push (ops[1]);
7149               if (op_type == ternary_op)
7150                 slp_ops.quick_push (ops[2]);
7151
7152               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7153
7154               vec_oprnds0.safe_splice (vec_defs[0]);
7155               vec_defs[0].release ();
7156               vec_oprnds1.safe_splice (vec_defs[1]);
7157               vec_defs[1].release ();
7158               if (op_type == ternary_op)
7159                 {
7160                   vec_oprnds2.safe_splice (vec_defs[2]);
7161                   vec_defs[2].release ();
7162                 }
7163             }
7164           else
7165             {
7166               vec_oprnds0.quick_push
7167                 (vect_get_vec_def_for_operand (ops[0], stmt));
7168               vec_oprnds1.quick_push
7169                 (vect_get_vec_def_for_operand (ops[1], stmt));
7170               if (op_type == ternary_op)
7171                 vec_oprnds2.quick_push
7172                   (vect_get_vec_def_for_operand (ops[2], stmt));
7173             }
7174         }
7175       else
7176         {
7177           if (!slp_node)
7178             {
7179               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7180
7181               if (single_defuse_cycle && reduc_index == 0)
7182                 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7183               else
7184                 vec_oprnds0[0]
7185                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7186               if (single_defuse_cycle && reduc_index == 1)
7187                 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7188               else
7189                 vec_oprnds1[0]
7190                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7191               if (op_type == ternary_op)
7192                 {
7193                   if (single_defuse_cycle && reduc_index == 2)
7194                     vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7195                   else
7196                     vec_oprnds2[0]
7197                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7198                 }
7199             }
7200         }
7201
7202       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7203         {
7204           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7205           if (masked_loop_p)
7206             {
7207               /* Make sure that the reduction accumulator is vop[0].  */
7208               if (reduc_index == 1)
7209                 {
7210                   gcc_assert (commutative_tree_code (code));
7211                   std::swap (vop[0], vop[1]);
7212                 }
7213               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7214                                               vectype_in, i * ncopies + j);
7215               gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7216                                                         vop[0], vop[1],
7217                                                         vop[0]);
7218               new_temp = make_ssa_name (vec_dest, call);
7219               gimple_call_set_lhs (call, new_temp);
7220               gimple_call_set_nothrow (call, true);
7221               new_stmt = call;
7222             }
7223           else
7224             {
7225               if (op_type == ternary_op)
7226                 vop[2] = vec_oprnds2[i];
7227
7228               new_temp = make_ssa_name (vec_dest, new_stmt);
7229               new_stmt = gimple_build_assign (new_temp, code,
7230                                               vop[0], vop[1], vop[2]);
7231             }
7232           vect_finish_stmt_generation (stmt, new_stmt, gsi);
7233
7234           if (slp_node)
7235             {
7236               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7237               vect_defs.quick_push (new_temp);
7238             }
7239           else
7240             vect_defs[0] = new_temp;
7241         }
7242
7243       if (slp_node)
7244         continue;
7245
7246       if (j == 0)
7247         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7248       else
7249         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7250
7251       prev_stmt_info = vinfo_for_stmt (new_stmt);
7252     }
7253
7254   /* Finalize the reduction-phi (set its arguments) and create the
7255      epilog reduction code.  */
7256   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7257     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7258
7259   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7260                                     epilog_copies, reduc_fn, phis,
7261                                     double_reduc, slp_node, slp_node_instance,
7262                                     cond_reduc_val, cond_reduc_op_code,
7263                                     neutral_op);
7264
7265   return true;
7266 }
7267
7268 /* Function vect_min_worthwhile_factor.
7269
7270    For a loop where we could vectorize the operation indicated by CODE,
7271    return the minimum vectorization factor that makes it worthwhile
7272    to use generic vectors.  */
7273 static unsigned int
7274 vect_min_worthwhile_factor (enum tree_code code)
7275 {
7276   switch (code)
7277     {
7278     case PLUS_EXPR:
7279     case MINUS_EXPR:
7280     case NEGATE_EXPR:
7281       return 4;
7282
7283     case BIT_AND_EXPR:
7284     case BIT_IOR_EXPR:
7285     case BIT_XOR_EXPR:
7286     case BIT_NOT_EXPR:
7287       return 2;
7288
7289     default:
7290       return INT_MAX;
7291     }
7292 }
7293
7294 /* Return true if VINFO indicates we are doing loop vectorization and if
7295    it is worth decomposing CODE operations into scalar operations for
7296    that loop's vectorization factor.  */
7297
7298 bool
7299 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7300 {
7301   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7302   unsigned HOST_WIDE_INT value;
7303   return (loop_vinfo
7304           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7305           && value >= vect_min_worthwhile_factor (code));
7306 }
7307
7308 /* Function vectorizable_induction
7309
7310    Check if PHI performs an induction computation that can be vectorized.
7311    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7312    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7313    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7314
7315 bool
7316 vectorizable_induction (gimple *phi,
7317                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7318                         gimple **vec_stmt, slp_tree slp_node,
7319                         stmt_vector_for_cost *cost_vec)
7320 {
7321   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7322   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7323   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7324   unsigned ncopies;
7325   bool nested_in_vect_loop = false;
7326   struct loop *iv_loop;
7327   tree vec_def;
7328   edge pe = loop_preheader_edge (loop);
7329   basic_block new_bb;
7330   tree new_vec, vec_init, vec_step, t;
7331   tree new_name;
7332   gimple *new_stmt;
7333   gphi *induction_phi;
7334   tree induc_def, vec_dest;
7335   tree init_expr, step_expr;
7336   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7337   unsigned i;
7338   tree expr;
7339   gimple_seq stmts;
7340   imm_use_iterator imm_iter;
7341   use_operand_p use_p;
7342   gimple *exit_phi;
7343   edge latch_e;
7344   tree loop_arg;
7345   gimple_stmt_iterator si;
7346   basic_block bb = gimple_bb (phi);
7347
7348   if (gimple_code (phi) != GIMPLE_PHI)
7349     return false;
7350
7351   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7352     return false;
7353
7354   /* Make sure it was recognized as induction computation.  */
7355   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7356     return false;
7357
7358   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7359   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7360
7361   if (slp_node)
7362     ncopies = 1;
7363   else
7364     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7365   gcc_assert (ncopies >= 1);
7366
7367   /* FORNOW. These restrictions should be relaxed.  */
7368   if (nested_in_vect_loop_p (loop, phi))
7369     {
7370       imm_use_iterator imm_iter;
7371       use_operand_p use_p;
7372       gimple *exit_phi;
7373       edge latch_e;
7374       tree loop_arg;
7375
7376       if (ncopies > 1)
7377         {
7378           if (dump_enabled_p ())
7379             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7380                              "multiple types in nested loop.\n");
7381           return false;
7382         }
7383
7384       /* FORNOW: outer loop induction with SLP not supported.  */
7385       if (STMT_SLP_TYPE (stmt_info))
7386         return false;
7387
7388       exit_phi = NULL;
7389       latch_e = loop_latch_edge (loop->inner);
7390       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7391       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7392         {
7393           gimple *use_stmt = USE_STMT (use_p);
7394           if (is_gimple_debug (use_stmt))
7395             continue;
7396
7397           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7398             {
7399               exit_phi = use_stmt;
7400               break;
7401             }
7402         }
7403       if (exit_phi)
7404         {
7405           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7406           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7407                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7408             {
7409               if (dump_enabled_p ())
7410                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7411                                  "inner-loop induction only used outside "
7412                                  "of the outer vectorized loop.\n");
7413               return false;
7414             }
7415         }
7416
7417       nested_in_vect_loop = true;
7418       iv_loop = loop->inner;
7419     }
7420   else
7421     iv_loop = loop;
7422   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7423
7424   if (slp_node && !nunits.is_constant ())
7425     {
7426       /* The current SLP code creates the initial value element-by-element.  */
7427       if (dump_enabled_p ())
7428         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7429                          "SLP induction not supported for variable-length"
7430                          " vectors.\n");
7431       return false;
7432     }
7433
7434   if (!vec_stmt) /* transformation not required.  */
7435     {
7436       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7437       DUMP_VECT_SCOPE ("vectorizable_induction");
7438       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7439       return true;
7440     }
7441
7442   /* Transform.  */
7443
7444   /* Compute a vector variable, initialized with the first VF values of
7445      the induction variable.  E.g., for an iv with IV_PHI='X' and
7446      evolution S, for a vector of 4 units, we want to compute:
7447      [X, X + S, X + 2*S, X + 3*S].  */
7448
7449   if (dump_enabled_p ())
7450     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7451
7452   latch_e = loop_latch_edge (iv_loop);
7453   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7454
7455   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7456   gcc_assert (step_expr != NULL_TREE);
7457
7458   pe = loop_preheader_edge (iv_loop);
7459   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7460                                      loop_preheader_edge (iv_loop));
7461
7462   stmts = NULL;
7463   if (!nested_in_vect_loop)
7464     {
7465       /* Convert the initial value to the desired type.  */
7466       tree new_type = TREE_TYPE (vectype);
7467       init_expr = gimple_convert (&stmts, new_type, init_expr);
7468
7469       /* If we are using the loop mask to "peel" for alignment then we need
7470          to adjust the start value here.  */
7471       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7472       if (skip_niters != NULL_TREE)
7473         {
7474           if (FLOAT_TYPE_P (vectype))
7475             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7476                                         skip_niters);
7477           else
7478             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7479           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7480                                          skip_niters, step_expr);
7481           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7482                                     init_expr, skip_step);
7483         }
7484     }
7485
7486   /* Convert the step to the desired type.  */
7487   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7488
7489   if (stmts)
7490     {
7491       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7492       gcc_assert (!new_bb);
7493     }
7494
7495   /* Find the first insertion point in the BB.  */
7496   si = gsi_after_labels (bb);
7497
7498   /* For SLP induction we have to generate several IVs as for example
7499      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7500      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7501      [VF*S, VF*S, VF*S, VF*S] for all.  */
7502   if (slp_node)
7503     {
7504       /* Enforced above.  */
7505       unsigned int const_nunits = nunits.to_constant ();
7506
7507       /* Generate [VF*S, VF*S, ... ].  */
7508       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7509         {
7510           expr = build_int_cst (integer_type_node, vf);
7511           expr = fold_convert (TREE_TYPE (step_expr), expr);
7512         }
7513       else
7514         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7515       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7516                               expr, step_expr);
7517       if (! CONSTANT_CLASS_P (new_name))
7518         new_name = vect_init_vector (phi, new_name,
7519                                      TREE_TYPE (step_expr), NULL);
7520       new_vec = build_vector_from_val (vectype, new_name);
7521       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7522
7523       /* Now generate the IVs.  */
7524       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7525       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7526       unsigned elts = const_nunits * nvects;
7527       unsigned nivs = least_common_multiple (group_size,
7528                                              const_nunits) / const_nunits;
7529       gcc_assert (elts % group_size == 0);
7530       tree elt = init_expr;
7531       unsigned ivn;
7532       for (ivn = 0; ivn < nivs; ++ivn)
7533         {
7534           tree_vector_builder elts (vectype, const_nunits, 1);
7535           stmts = NULL;
7536           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7537             {
7538               if (ivn*const_nunits + eltn >= group_size
7539                   && (ivn * const_nunits + eltn) % group_size == 0)
7540                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7541                                     elt, step_expr);
7542               elts.quick_push (elt);
7543             }
7544           vec_init = gimple_build_vector (&stmts, &elts);
7545           if (stmts)
7546             {
7547               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7548               gcc_assert (!new_bb);
7549             }
7550
7551           /* Create the induction-phi that defines the induction-operand.  */
7552           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7553           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7554           set_vinfo_for_stmt (induction_phi,
7555                               new_stmt_vec_info (induction_phi, loop_vinfo));
7556           induc_def = PHI_RESULT (induction_phi);
7557
7558           /* Create the iv update inside the loop  */
7559           vec_def = make_ssa_name (vec_dest);
7560           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7561           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7562           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7563
7564           /* Set the arguments of the phi node:  */
7565           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7566           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7567                        UNKNOWN_LOCATION);
7568
7569           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7570         }
7571
7572       /* Re-use IVs when we can.  */
7573       if (ivn < nvects)
7574         {
7575           unsigned vfp
7576             = least_common_multiple (group_size, const_nunits) / group_size;
7577           /* Generate [VF'*S, VF'*S, ... ].  */
7578           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7579             {
7580               expr = build_int_cst (integer_type_node, vfp);
7581               expr = fold_convert (TREE_TYPE (step_expr), expr);
7582             }
7583           else
7584             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7585           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7586                                   expr, step_expr);
7587           if (! CONSTANT_CLASS_P (new_name))
7588             new_name = vect_init_vector (phi, new_name,
7589                                          TREE_TYPE (step_expr), NULL);
7590           new_vec = build_vector_from_val (vectype, new_name);
7591           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7592           for (; ivn < nvects; ++ivn)
7593             {
7594               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7595               tree def;
7596               if (gimple_code (iv) == GIMPLE_PHI)
7597                 def = gimple_phi_result (iv);
7598               else
7599                 def = gimple_assign_lhs (iv);
7600               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7601                                               PLUS_EXPR,
7602                                               def, vec_step);
7603               if (gimple_code (iv) == GIMPLE_PHI)
7604                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7605               else
7606                 {
7607                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7608                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7609                 }
7610               set_vinfo_for_stmt (new_stmt,
7611                                   new_stmt_vec_info (new_stmt, loop_vinfo));
7612               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7613             }
7614         }
7615
7616       return true;
7617     }
7618
7619   /* Create the vector that holds the initial_value of the induction.  */
7620   if (nested_in_vect_loop)
7621     {
7622       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7623          been created during vectorization of previous stmts.  We obtain it
7624          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7625       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7626       /* If the initial value is not of proper type, convert it.  */
7627       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7628         {
7629           new_stmt
7630             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7631                                                           vect_simple_var,
7632                                                           "vec_iv_"),
7633                                    VIEW_CONVERT_EXPR,
7634                                    build1 (VIEW_CONVERT_EXPR, vectype,
7635                                            vec_init));
7636           vec_init = gimple_assign_lhs (new_stmt);
7637           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7638                                                  new_stmt);
7639           gcc_assert (!new_bb);
7640           set_vinfo_for_stmt (new_stmt,
7641                               new_stmt_vec_info (new_stmt, loop_vinfo));
7642         }
7643     }
7644   else
7645     {
7646       /* iv_loop is the loop to be vectorized. Create:
7647          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7648       stmts = NULL;
7649       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7650
7651       unsigned HOST_WIDE_INT const_nunits;
7652       if (nunits.is_constant (&const_nunits))
7653         {
7654           tree_vector_builder elts (vectype, const_nunits, 1);
7655           elts.quick_push (new_name);
7656           for (i = 1; i < const_nunits; i++)
7657             {
7658               /* Create: new_name_i = new_name + step_expr  */
7659               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7660                                        new_name, step_expr);
7661               elts.quick_push (new_name);
7662             }
7663           /* Create a vector from [new_name_0, new_name_1, ...,
7664              new_name_nunits-1]  */
7665           vec_init = gimple_build_vector (&stmts, &elts);
7666         }
7667       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7668         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7669         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7670                                  new_name, step_expr);
7671       else
7672         {
7673           /* Build:
7674                 [base, base, base, ...]
7675                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7676           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7677           gcc_assert (flag_associative_math);
7678           tree index = build_index_vector (vectype, 0, 1);
7679           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7680                                                         new_name);
7681           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7682                                                         step_expr);
7683           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7684           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7685                                    vec_init, step_vec);
7686           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7687                                    vec_init, base_vec);
7688         }
7689
7690       if (stmts)
7691         {
7692           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7693           gcc_assert (!new_bb);
7694         }
7695     }
7696
7697
7698   /* Create the vector that holds the step of the induction.  */
7699   if (nested_in_vect_loop)
7700     /* iv_loop is nested in the loop to be vectorized. Generate:
7701        vec_step = [S, S, S, S]  */
7702     new_name = step_expr;
7703   else
7704     {
7705       /* iv_loop is the loop to be vectorized. Generate:
7706           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7707       gimple_seq seq = NULL;
7708       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7709         {
7710           expr = build_int_cst (integer_type_node, vf);
7711           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7712         }
7713       else
7714         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7715       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7716                                expr, step_expr);
7717       if (seq)
7718         {
7719           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7720           gcc_assert (!new_bb);
7721         }
7722     }
7723
7724   t = unshare_expr (new_name);
7725   gcc_assert (CONSTANT_CLASS_P (new_name)
7726               || TREE_CODE (new_name) == SSA_NAME);
7727   new_vec = build_vector_from_val (vectype, t);
7728   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7729
7730
7731   /* Create the following def-use cycle:
7732      loop prolog:
7733          vec_init = ...
7734          vec_step = ...
7735      loop:
7736          vec_iv = PHI <vec_init, vec_loop>
7737          ...
7738          STMT
7739          ...
7740          vec_loop = vec_iv + vec_step;  */
7741
7742   /* Create the induction-phi that defines the induction-operand.  */
7743   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7744   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7745   set_vinfo_for_stmt (induction_phi,
7746                       new_stmt_vec_info (induction_phi, loop_vinfo));
7747   induc_def = PHI_RESULT (induction_phi);
7748
7749   /* Create the iv update inside the loop  */
7750   vec_def = make_ssa_name (vec_dest);
7751   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7752   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7753   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7754
7755   /* Set the arguments of the phi node:  */
7756   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7757   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7758                UNKNOWN_LOCATION);
7759
7760   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7761
7762   /* In case that vectorization factor (VF) is bigger than the number
7763      of elements that we can fit in a vectype (nunits), we have to generate
7764      more than one vector stmt - i.e - we need to "unroll" the
7765      vector stmt by a factor VF/nunits.  For more details see documentation
7766      in vectorizable_operation.  */
7767
7768   if (ncopies > 1)
7769     {
7770       gimple_seq seq = NULL;
7771       stmt_vec_info prev_stmt_vinfo;
7772       /* FORNOW. This restriction should be relaxed.  */
7773       gcc_assert (!nested_in_vect_loop);
7774
7775       /* Create the vector that holds the step of the induction.  */
7776       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7777         {
7778           expr = build_int_cst (integer_type_node, nunits);
7779           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7780         }
7781       else
7782         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7783       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7784                                expr, step_expr);
7785       if (seq)
7786         {
7787           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7788           gcc_assert (!new_bb);
7789         }
7790
7791       t = unshare_expr (new_name);
7792       gcc_assert (CONSTANT_CLASS_P (new_name)
7793                   || TREE_CODE (new_name) == SSA_NAME);
7794       new_vec = build_vector_from_val (vectype, t);
7795       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7796
7797       vec_def = induc_def;
7798       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7799       for (i = 1; i < ncopies; i++)
7800         {
7801           /* vec_i = vec_prev + vec_step  */
7802           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7803                                           vec_def, vec_step);
7804           vec_def = make_ssa_name (vec_dest, new_stmt);
7805           gimple_assign_set_lhs (new_stmt, vec_def);
7806
7807           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7808           set_vinfo_for_stmt (new_stmt,
7809                               new_stmt_vec_info (new_stmt, loop_vinfo));
7810           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7811           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7812         }
7813     }
7814
7815   if (nested_in_vect_loop)
7816     {
7817       /* Find the loop-closed exit-phi of the induction, and record
7818          the final vector of induction results:  */
7819       exit_phi = NULL;
7820       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7821         {
7822           gimple *use_stmt = USE_STMT (use_p);
7823           if (is_gimple_debug (use_stmt))
7824             continue;
7825
7826           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7827             {
7828               exit_phi = use_stmt;
7829               break;
7830             }
7831         }
7832       if (exit_phi)
7833         {
7834           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7835           /* FORNOW. Currently not supporting the case that an inner-loop induction
7836              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7837           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7838                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7839
7840           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7841           if (dump_enabled_p ())
7842             {
7843               dump_printf_loc (MSG_NOTE, vect_location,
7844                                "vector of inductions after inner-loop:");
7845               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7846             }
7847         }
7848     }
7849
7850
7851   if (dump_enabled_p ())
7852     {
7853       dump_printf_loc (MSG_NOTE, vect_location,
7854                        "transform induction: created def-use cycle: ");
7855       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7856       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7857                         SSA_NAME_DEF_STMT (vec_def), 0);
7858     }
7859
7860   return true;
7861 }
7862
7863 /* Function vectorizable_live_operation.
7864
7865    STMT computes a value that is used outside the loop.  Check if
7866    it can be supported.  */
7867
7868 bool
7869 vectorizable_live_operation (gimple *stmt,
7870                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7871                              slp_tree slp_node, int slp_index,
7872                              gimple **vec_stmt,
7873                              stmt_vector_for_cost *)
7874 {
7875   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7876   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7877   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7878   imm_use_iterator imm_iter;
7879   tree lhs, lhs_type, bitsize, vec_bitsize;
7880   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7881   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7882   int ncopies;
7883   gimple *use_stmt;
7884   auto_vec<tree> vec_oprnds;
7885   int vec_entry = 0;
7886   poly_uint64 vec_index = 0;
7887
7888   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7889
7890   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7891     return false;
7892
7893   /* FORNOW.  CHECKME.  */
7894   if (nested_in_vect_loop_p (loop, stmt))
7895     return false;
7896
7897   /* If STMT is not relevant and it is a simple assignment and its inputs are
7898      invariant then it can remain in place, unvectorized.  The original last
7899      scalar value that it computes will be used.  */
7900   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7901     {
7902       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7903       if (dump_enabled_p ())
7904         dump_printf_loc (MSG_NOTE, vect_location,
7905                          "statement is simple and uses invariant.  Leaving in "
7906                          "place.\n");
7907       return true;
7908     }
7909
7910   if (slp_node)
7911     ncopies = 1;
7912   else
7913     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7914
7915   if (slp_node)
7916     {
7917       gcc_assert (slp_index >= 0);
7918
7919       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7920       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7921
7922       /* Get the last occurrence of the scalar index from the concatenation of
7923          all the slp vectors. Calculate which slp vector it is and the index
7924          within.  */
7925       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7926
7927       /* Calculate which vector contains the result, and which lane of
7928          that vector we need.  */
7929       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7930         {
7931           if (dump_enabled_p ())
7932             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7933                              "Cannot determine which vector holds the"
7934                              " final result.\n");
7935           return false;
7936         }
7937     }
7938
7939   if (!vec_stmt)
7940     {
7941       /* No transformation required.  */
7942       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7943         {
7944           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7945                                                OPTIMIZE_FOR_SPEED))
7946             {
7947               if (dump_enabled_p ())
7948                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7949                                  "can't use a fully-masked loop because "
7950                                  "the target doesn't support extract last "
7951                                  "reduction.\n");
7952               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7953             }
7954           else if (slp_node)
7955             {
7956               if (dump_enabled_p ())
7957                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7958                                  "can't use a fully-masked loop because an "
7959                                  "SLP statement is live after the loop.\n");
7960               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7961             }
7962           else if (ncopies > 1)
7963             {
7964               if (dump_enabled_p ())
7965                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7966                                  "can't use a fully-masked loop because"
7967                                  " ncopies is greater than 1.\n");
7968               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7969             }
7970           else
7971             {
7972               gcc_assert (ncopies == 1 && !slp_node);
7973               vect_record_loop_mask (loop_vinfo,
7974                                      &LOOP_VINFO_MASKS (loop_vinfo),
7975                                      1, vectype);
7976             }
7977         }
7978       return true;
7979     }
7980
7981   /* If stmt has a related stmt, then use that for getting the lhs.  */
7982   if (is_pattern_stmt_p (stmt_info))
7983     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7984
7985   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7986         : gimple_get_lhs (stmt);
7987   lhs_type = TREE_TYPE (lhs);
7988
7989   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7990              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7991              : TYPE_SIZE (TREE_TYPE (vectype)));
7992   vec_bitsize = TYPE_SIZE (vectype);
7993
7994   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7995   tree vec_lhs, bitstart;
7996   if (slp_node)
7997     {
7998       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7999
8000       /* Get the correct slp vectorized stmt.  */
8001       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8002       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8003         vec_lhs = gimple_phi_result (phi);
8004       else
8005         vec_lhs = gimple_get_lhs (vec_stmt);
8006
8007       /* Get entry to use.  */
8008       bitstart = bitsize_int (vec_index);
8009       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8010     }
8011   else
8012     {
8013       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8014       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8015       gcc_checking_assert (ncopies == 1
8016                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8017
8018       /* For multiple copies, get the last copy.  */
8019       for (int i = 1; i < ncopies; ++i)
8020         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8021                                                   vec_lhs);
8022
8023       /* Get the last lane in the vector.  */
8024       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8025     }
8026
8027   gimple_seq stmts = NULL;
8028   tree new_tree;
8029   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8030     {
8031       /* Emit:
8032
8033            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8034
8035          where VEC_LHS is the vectorized live-out result and MASK is
8036          the loop mask for the final iteration.  */
8037       gcc_assert (ncopies == 1 && !slp_node);
8038       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8039       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8040                                       1, vectype, 0);
8041       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8042                                       scalar_type, mask, vec_lhs);
8043
8044       /* Convert the extracted vector element to the required scalar type.  */
8045       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8046     }
8047   else
8048     {
8049       tree bftype = TREE_TYPE (vectype);
8050       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8051         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8052       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8053       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8054                                        &stmts, true, NULL_TREE);
8055     }
8056
8057   if (stmts)
8058     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8059
8060   /* Replace use of lhs with newly computed result.  If the use stmt is a
8061      single arg PHI, just replace all uses of PHI result.  It's necessary
8062      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8063   use_operand_p use_p;
8064   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8065     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8066         && !is_gimple_debug (use_stmt))
8067     {
8068       if (gimple_code (use_stmt) == GIMPLE_PHI
8069           && gimple_phi_num_args (use_stmt) == 1)
8070         {
8071           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8072         }
8073       else
8074         {
8075           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8076             SET_USE (use_p, new_tree);
8077         }
8078       update_stmt (use_stmt);
8079     }
8080
8081   return true;
8082 }
8083
8084 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8085
8086 static void
8087 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8088 {
8089   ssa_op_iter op_iter;
8090   imm_use_iterator imm_iter;
8091   def_operand_p def_p;
8092   gimple *ustmt;
8093
8094   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8095     {
8096       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8097         {
8098           basic_block bb;
8099
8100           if (!is_gimple_debug (ustmt))
8101             continue;
8102
8103           bb = gimple_bb (ustmt);
8104
8105           if (!flow_bb_inside_loop_p (loop, bb))
8106             {
8107               if (gimple_debug_bind_p (ustmt))
8108                 {
8109                   if (dump_enabled_p ())
8110                     dump_printf_loc (MSG_NOTE, vect_location,
8111                                      "killing debug use\n");
8112
8113                   gimple_debug_bind_reset_value (ustmt);
8114                   update_stmt (ustmt);
8115                 }
8116               else
8117                 gcc_unreachable ();
8118             }
8119         }
8120     }
8121 }
8122
8123 /* Given loop represented by LOOP_VINFO, return true if computation of
8124    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8125    otherwise.  */
8126
8127 static bool
8128 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8129 {
8130   /* Constant case.  */
8131   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8132     {
8133       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8134       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8135
8136       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8137       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8138       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8139         return true;
8140     }
8141
8142   widest_int max;
8143   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8144   /* Check the upper bound of loop niters.  */
8145   if (get_max_loop_iterations (loop, &max))
8146     {
8147       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8148       signop sgn = TYPE_SIGN (type);
8149       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8150       if (max < type_max)
8151         return true;
8152     }
8153   return false;
8154 }
8155
8156 /* Return a mask type with half the number of elements as TYPE.  */
8157
8158 tree
8159 vect_halve_mask_nunits (tree type)
8160 {
8161   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8162   return build_truth_vector_type (nunits, current_vector_size);
8163 }
8164
8165 /* Return a mask type with twice as many elements as TYPE.  */
8166
8167 tree
8168 vect_double_mask_nunits (tree type)
8169 {
8170   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8171   return build_truth_vector_type (nunits, current_vector_size);
8172 }
8173
8174 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8175    contain a sequence of NVECTORS masks that each control a vector of type
8176    VECTYPE.  */
8177
8178 void
8179 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8180                        unsigned int nvectors, tree vectype)
8181 {
8182   gcc_assert (nvectors != 0);
8183   if (masks->length () < nvectors)
8184     masks->safe_grow_cleared (nvectors);
8185   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8186   /* The number of scalars per iteration and the number of vectors are
8187      both compile-time constants.  */
8188   unsigned int nscalars_per_iter
8189     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8190                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8191   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8192     {
8193       rgm->max_nscalars_per_iter = nscalars_per_iter;
8194       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8195     }
8196 }
8197
8198 /* Given a complete set of masks MASKS, extract mask number INDEX
8199    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8200    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8201
8202    See the comment above vec_loop_masks for more details about the mask
8203    arrangement.  */
8204
8205 tree
8206 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8207                     unsigned int nvectors, tree vectype, unsigned int index)
8208 {
8209   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8210   tree mask_type = rgm->mask_type;
8211
8212   /* Populate the rgroup's mask array, if this is the first time we've
8213      used it.  */
8214   if (rgm->masks.is_empty ())
8215     {
8216       rgm->masks.safe_grow_cleared (nvectors);
8217       for (unsigned int i = 0; i < nvectors; ++i)
8218         {
8219           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8220           /* Provide a dummy definition until the real one is available.  */
8221           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8222           rgm->masks[i] = mask;
8223         }
8224     }
8225
8226   tree mask = rgm->masks[index];
8227   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8228                 TYPE_VECTOR_SUBPARTS (vectype)))
8229     {
8230       /* A loop mask for data type X can be reused for data type Y
8231          if X has N times more elements than Y and if Y's elements
8232          are N times bigger than X's.  In this case each sequence
8233          of N elements in the loop mask will be all-zero or all-one.
8234          We can then view-convert the mask so that each sequence of
8235          N elements is replaced by a single element.  */
8236       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8237                               TYPE_VECTOR_SUBPARTS (vectype)));
8238       gimple_seq seq = NULL;
8239       mask_type = build_same_sized_truth_vector_type (vectype);
8240       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8241       if (seq)
8242         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8243     }
8244   return mask;
8245 }
8246
8247 /* Scale profiling counters by estimation for LOOP which is vectorized
8248    by factor VF.  */
8249
8250 static void
8251 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8252 {
8253   edge preheader = loop_preheader_edge (loop);
8254   /* Reduce loop iterations by the vectorization factor.  */
8255   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8256   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8257
8258   if (freq_h.nonzero_p ())
8259     {
8260       profile_probability p;
8261
8262       /* Avoid dropping loop body profile counter to 0 because of zero count
8263          in loop's preheader.  */
8264       if (!(freq_e == profile_count::zero ()))
8265         freq_e = freq_e.force_nonzero ();
8266       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8267       scale_loop_frequencies (loop, p);
8268     }
8269
8270   edge exit_e = single_exit (loop);
8271   exit_e->probability = profile_probability::always ()
8272                                  .apply_scale (1, new_est_niter + 1);
8273
8274   edge exit_l = single_pred_edge (loop->latch);
8275   profile_probability prob = exit_l->probability;
8276   exit_l->probability = exit_e->probability.invert ();
8277   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8278     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8279 }
8280
8281 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8282    When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8283    *SLP_SCHEDULE is a running record of whether we have called
8284    vect_schedule_slp.  */
8285
8286 static void
8287 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8288                           gimple_stmt_iterator *gsi,
8289                           stmt_vec_info *seen_store, bool *slp_scheduled)
8290 {
8291   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8292   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8293   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8294   if (!stmt_info)
8295     return;
8296
8297   if (dump_enabled_p ())
8298     {
8299       dump_printf_loc (MSG_NOTE, vect_location,
8300                        "------>vectorizing statement: ");
8301       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8302     }
8303
8304   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8305     vect_loop_kill_debug_uses (loop, stmt);
8306
8307   if (!STMT_VINFO_RELEVANT_P (stmt_info)
8308       && !STMT_VINFO_LIVE_P (stmt_info))
8309     return;
8310
8311   if (STMT_VINFO_VECTYPE (stmt_info))
8312     {
8313       poly_uint64 nunits
8314         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8315       if (!STMT_SLP_TYPE (stmt_info)
8316           && maybe_ne (nunits, vf)
8317           && dump_enabled_p ())
8318         /* For SLP VF is set according to unrolling factor, and not
8319            to vector size, hence for SLP this print is not valid.  */
8320         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8321     }
8322
8323   /* SLP.  Schedule all the SLP instances when the first SLP stmt is
8324      reached.  */
8325   if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8326     {
8327
8328       if (!*slp_scheduled)
8329         {
8330           *slp_scheduled = true;
8331
8332           DUMP_VECT_SCOPE ("scheduling SLP instances");
8333
8334           vect_schedule_slp (loop_vinfo);
8335         }
8336
8337       /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8338       if (slptype == pure_slp)
8339         return;
8340     }
8341
8342   if (dump_enabled_p ())
8343     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8344
8345   bool grouped_store = false;
8346   if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8347     *seen_store = stmt_info;
8348 }
8349
8350 /* Function vect_transform_loop.
8351
8352    The analysis phase has determined that the loop is vectorizable.
8353    Vectorize the loop - created vectorized stmts to replace the scalar
8354    stmts in the loop, and update the loop exit condition.
8355    Returns scalar epilogue loop if any.  */
8356
8357 struct loop *
8358 vect_transform_loop (loop_vec_info loop_vinfo)
8359 {
8360   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8361   struct loop *epilogue = NULL;
8362   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8363   int nbbs = loop->num_nodes;
8364   int i;
8365   tree niters_vector = NULL_TREE;
8366   tree step_vector = NULL_TREE;
8367   tree niters_vector_mult_vf = NULL_TREE;
8368   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8369   unsigned int lowest_vf = constant_lower_bound (vf);
8370   bool slp_scheduled = false;
8371   gimple *stmt;
8372   bool check_profitability = false;
8373   unsigned int th;
8374
8375   DUMP_VECT_SCOPE ("vec_transform_loop");
8376
8377   loop_vinfo->shared->check_datarefs ();
8378
8379   /* Use the more conservative vectorization threshold.  If the number
8380      of iterations is constant assume the cost check has been performed
8381      by our caller.  If the threshold makes all loops profitable that
8382      run at least the (estimated) vectorization factor number of times
8383      checking is pointless, too.  */
8384   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8385   if (th >= vect_vf_for_cost (loop_vinfo)
8386       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8387     {
8388       if (dump_enabled_p ())
8389         dump_printf_loc (MSG_NOTE, vect_location,
8390                          "Profitability threshold is %d loop iterations.\n",
8391                          th);
8392       check_profitability = true;
8393     }
8394
8395   /* Make sure there exists a single-predecessor exit bb.  Do this before
8396      versioning.   */
8397   edge e = single_exit (loop);
8398   if (! single_pred_p (e->dest))
8399     {
8400       split_loop_exit_edge (e);
8401       if (dump_enabled_p ())
8402         dump_printf (MSG_NOTE, "split exit edge\n");
8403     }
8404
8405   /* Version the loop first, if required, so the profitability check
8406      comes first.  */
8407
8408   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8409     {
8410       poly_uint64 versioning_threshold
8411         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8412       if (check_profitability
8413           && ordered_p (poly_uint64 (th), versioning_threshold))
8414         {
8415           versioning_threshold = ordered_max (poly_uint64 (th),
8416                                               versioning_threshold);
8417           check_profitability = false;
8418         }
8419       vect_loop_versioning (loop_vinfo, th, check_profitability,
8420                             versioning_threshold);
8421       check_profitability = false;
8422     }
8423
8424   /* Make sure there exists a single-predecessor exit bb also on the
8425      scalar loop copy.  Do this after versioning but before peeling
8426      so CFG structure is fine for both scalar and if-converted loop
8427      to make slpeel_duplicate_current_defs_from_edges face matched
8428      loop closed PHI nodes on the exit.  */
8429   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8430     {
8431       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8432       if (! single_pred_p (e->dest))
8433         {
8434           split_loop_exit_edge (e);
8435           if (dump_enabled_p ())
8436             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8437         }
8438     }
8439
8440   tree niters = vect_build_loop_niters (loop_vinfo);
8441   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8442   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8443   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8444   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8445                               &step_vector, &niters_vector_mult_vf, th,
8446                               check_profitability, niters_no_overflow);
8447
8448   if (niters_vector == NULL_TREE)
8449     {
8450       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8451           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8452           && known_eq (lowest_vf, vf))
8453         {
8454           niters_vector
8455             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8456                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8457           step_vector = build_one_cst (TREE_TYPE (niters));
8458         }
8459       else
8460         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8461                                      &step_vector, niters_no_overflow);
8462     }
8463
8464   /* 1) Make sure the loop header has exactly two entries
8465      2) Make sure we have a preheader basic block.  */
8466
8467   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8468
8469   split_edge (loop_preheader_edge (loop));
8470
8471   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8472       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8473     /* This will deal with any possible peeling.  */
8474     vect_prepare_for_masked_peels (loop_vinfo);
8475
8476   /* FORNOW: the vectorizer supports only loops which body consist
8477      of one basic block (header + empty latch). When the vectorizer will
8478      support more involved loop forms, the order by which the BBs are
8479      traversed need to be reconsidered.  */
8480
8481   for (i = 0; i < nbbs; i++)
8482     {
8483       basic_block bb = bbs[i];
8484       stmt_vec_info stmt_info;
8485
8486       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8487            gsi_next (&si))
8488         {
8489           gphi *phi = si.phi ();
8490           if (dump_enabled_p ())
8491             {
8492               dump_printf_loc (MSG_NOTE, vect_location,
8493                                "------>vectorizing phi: ");
8494               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8495             }
8496           stmt_info = vinfo_for_stmt (phi);
8497           if (!stmt_info)
8498             continue;
8499
8500           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8501             vect_loop_kill_debug_uses (loop, phi);
8502
8503           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8504               && !STMT_VINFO_LIVE_P (stmt_info))
8505             continue;
8506
8507           if (STMT_VINFO_VECTYPE (stmt_info)
8508               && (maybe_ne
8509                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8510               && dump_enabled_p ())
8511             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8512
8513           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8514                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8515                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8516               && ! PURE_SLP_STMT (stmt_info))
8517             {
8518               if (dump_enabled_p ())
8519                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8520               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8521             }
8522         }
8523
8524       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8525            !gsi_end_p (si);)
8526         {
8527           stmt = gsi_stmt (si);
8528           /* During vectorization remove existing clobber stmts.  */
8529           if (gimple_clobber_p (stmt))
8530             {
8531               unlink_stmt_vdef (stmt);
8532               gsi_remove (&si, true);
8533               release_defs (stmt);
8534             }
8535           else
8536             {
8537               stmt_info = vinfo_for_stmt (stmt);
8538
8539               /* vector stmts created in the outer-loop during vectorization of
8540                  stmts in an inner-loop may not have a stmt_info, and do not
8541                  need to be vectorized.  */
8542               stmt_vec_info seen_store = NULL;
8543               if (stmt_info)
8544                 {
8545                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8546                     {
8547                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8548                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8549                            !gsi_end_p (subsi); gsi_next (&subsi))
8550                         vect_transform_loop_stmt (loop_vinfo,
8551                                                   gsi_stmt (subsi), &si,
8552                                                   &seen_store,
8553                                                   &slp_scheduled);
8554                       gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8555                       vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8556                                                 &seen_store, &slp_scheduled);
8557                     }
8558                   vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8559                                             &seen_store, &slp_scheduled);
8560                 }
8561               if (seen_store)
8562                 {
8563                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8564                     {
8565                       /* Interleaving.  If IS_STORE is TRUE, the
8566                          vectorization of the interleaving chain was
8567                          completed - free all the stores in the chain.  */
8568                       gsi_next (&si);
8569                       vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8570                     }
8571                   else
8572                     {
8573                       /* Free the attached stmt_vec_info and remove the
8574                          stmt.  */
8575                       free_stmt_vec_info (stmt);
8576                       unlink_stmt_vdef (stmt);
8577                       gsi_remove (&si, true);
8578                       release_defs (stmt);
8579                     }
8580                 }
8581               else
8582                 gsi_next (&si);
8583             }
8584         }
8585
8586       /* Stub out scalar statements that must not survive vectorization.
8587          Doing this here helps with grouped statements, or statements that
8588          are involved in patterns.  */
8589       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8590            !gsi_end_p (gsi); gsi_next (&gsi))
8591         {
8592           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8593           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8594             {
8595               tree lhs = gimple_get_lhs (call);
8596               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8597                 {
8598                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8599                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8600                   gsi_replace (&gsi, new_stmt, true);
8601                 }
8602             }
8603         }
8604     }                           /* BBs in loop */
8605
8606   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8607      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8608   if (integer_onep (step_vector))
8609     niters_no_overflow = true;
8610   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8611                            niters_vector_mult_vf, !niters_no_overflow);
8612
8613   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8614   scale_profile_for_vect_loop (loop, assumed_vf);
8615
8616   /* True if the final iteration might not handle a full vector's
8617      worth of scalar iterations.  */
8618   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8619   /* The minimum number of iterations performed by the epilogue.  This
8620      is 1 when peeling for gaps because we always need a final scalar
8621      iteration.  */
8622   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8623   /* +1 to convert latch counts to loop iteration counts,
8624      -min_epilogue_iters to remove iterations that cannot be performed
8625        by the vector code.  */
8626   int bias_for_lowest = 1 - min_epilogue_iters;
8627   int bias_for_assumed = bias_for_lowest;
8628   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8629   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8630     {
8631       /* When the amount of peeling is known at compile time, the first
8632          iteration will have exactly alignment_npeels active elements.
8633          In the worst case it will have at least one.  */
8634       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8635       bias_for_lowest += lowest_vf - min_first_active;
8636       bias_for_assumed += assumed_vf - min_first_active;
8637     }
8638   /* In these calculations the "- 1" converts loop iteration counts
8639      back to latch counts.  */
8640   if (loop->any_upper_bound)
8641     loop->nb_iterations_upper_bound
8642       = (final_iter_may_be_partial
8643          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8644                           lowest_vf) - 1
8645          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8646                            lowest_vf) - 1);
8647   if (loop->any_likely_upper_bound)
8648     loop->nb_iterations_likely_upper_bound
8649       = (final_iter_may_be_partial
8650          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8651                           + bias_for_lowest, lowest_vf) - 1
8652          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8653                            + bias_for_lowest, lowest_vf) - 1);
8654   if (loop->any_estimate)
8655     loop->nb_iterations_estimate
8656       = (final_iter_may_be_partial
8657          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8658                           assumed_vf) - 1
8659          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8660                            assumed_vf) - 1);
8661
8662   if (dump_enabled_p ())
8663     {
8664       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8665         {
8666           dump_printf_loc (MSG_NOTE, vect_location,
8667                            "LOOP VECTORIZED\n");
8668           if (loop->inner)
8669             dump_printf_loc (MSG_NOTE, vect_location,
8670                              "OUTER LOOP VECTORIZED\n");
8671           dump_printf (MSG_NOTE, "\n");
8672         }
8673       else
8674         {
8675           dump_printf_loc (MSG_NOTE, vect_location,
8676                            "LOOP EPILOGUE VECTORIZED (VS=");
8677           dump_dec (MSG_NOTE, current_vector_size);
8678           dump_printf (MSG_NOTE, ")\n");
8679         }
8680     }
8681
8682   /* Free SLP instances here because otherwise stmt reference counting
8683      won't work.  */
8684   slp_instance instance;
8685   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8686     vect_free_slp_instance (instance, true);
8687   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8688   /* Clear-up safelen field since its value is invalid after vectorization
8689      since vectorized loop can have loop-carried dependencies.  */
8690   loop->safelen = 0;
8691
8692   /* Don't vectorize epilogue for epilogue.  */
8693   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8694     epilogue = NULL;
8695
8696   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8697     epilogue = NULL;
8698
8699   if (epilogue)
8700     {
8701       auto_vector_sizes vector_sizes;
8702       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8703       unsigned int next_size = 0;
8704
8705       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8706           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8707           && known_eq (vf, lowest_vf))
8708         {
8709           unsigned int eiters
8710             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8711                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8712           eiters = eiters % lowest_vf;
8713           epilogue->nb_iterations_upper_bound = eiters - 1;
8714
8715           unsigned int ratio;
8716           while (next_size < vector_sizes.length ()
8717                  && !(constant_multiple_p (current_vector_size,
8718                                            vector_sizes[next_size], &ratio)
8719                       && eiters >= lowest_vf / ratio))
8720             next_size += 1;
8721         }
8722       else
8723         while (next_size < vector_sizes.length ()
8724                && maybe_lt (current_vector_size, vector_sizes[next_size]))
8725           next_size += 1;
8726
8727       if (next_size == vector_sizes.length ())
8728         epilogue = NULL;
8729     }
8730
8731   if (epilogue)
8732     {
8733       epilogue->force_vectorize = loop->force_vectorize;
8734       epilogue->safelen = loop->safelen;
8735       epilogue->dont_vectorize = false;
8736
8737       /* We may need to if-convert epilogue to vectorize it.  */
8738       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8739         tree_if_conversion (epilogue);
8740     }
8741
8742   return epilogue;
8743 }
8744
8745 /* The code below is trying to perform simple optimization - revert
8746    if-conversion for masked stores, i.e. if the mask of a store is zero
8747    do not perform it and all stored value producers also if possible.
8748    For example,
8749      for (i=0; i<n; i++)
8750        if (c[i])
8751         {
8752           p1[i] += 1;
8753           p2[i] = p3[i] +2;
8754         }
8755    this transformation will produce the following semi-hammock:
8756
8757    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8758      {
8759        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8760        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8761        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8762        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8763        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8764        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8765      }
8766 */
8767
8768 void
8769 optimize_mask_stores (struct loop *loop)
8770 {
8771   basic_block *bbs = get_loop_body (loop);
8772   unsigned nbbs = loop->num_nodes;
8773   unsigned i;
8774   basic_block bb;
8775   struct loop *bb_loop;
8776   gimple_stmt_iterator gsi;
8777   gimple *stmt;
8778   auto_vec<gimple *> worklist;
8779
8780   vect_location = find_loop_location (loop);
8781   /* Pick up all masked stores in loop if any.  */
8782   for (i = 0; i < nbbs; i++)
8783     {
8784       bb = bbs[i];
8785       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8786            gsi_next (&gsi))
8787         {
8788           stmt = gsi_stmt (gsi);
8789           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8790             worklist.safe_push (stmt);
8791         }
8792     }
8793
8794   free (bbs);
8795   if (worklist.is_empty ())
8796     return;
8797
8798   /* Loop has masked stores.  */
8799   while (!worklist.is_empty ())
8800     {
8801       gimple *last, *last_store;
8802       edge e, efalse;
8803       tree mask;
8804       basic_block store_bb, join_bb;
8805       gimple_stmt_iterator gsi_to;
8806       tree vdef, new_vdef;
8807       gphi *phi;
8808       tree vectype;
8809       tree zero;
8810
8811       last = worklist.pop ();
8812       mask = gimple_call_arg (last, 2);
8813       bb = gimple_bb (last);
8814       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8815          the same loop as if_bb.  It could be different to LOOP when two
8816          level loop-nest is vectorized and mask_store belongs to the inner
8817          one.  */
8818       e = split_block (bb, last);
8819       bb_loop = bb->loop_father;
8820       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8821       join_bb = e->dest;
8822       store_bb = create_empty_bb (bb);
8823       add_bb_to_loop (store_bb, bb_loop);
8824       e->flags = EDGE_TRUE_VALUE;
8825       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8826       /* Put STORE_BB to likely part.  */
8827       efalse->probability = profile_probability::unlikely ();
8828       store_bb->count = efalse->count ();
8829       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8830       if (dom_info_available_p (CDI_DOMINATORS))
8831         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8832       if (dump_enabled_p ())
8833         dump_printf_loc (MSG_NOTE, vect_location,
8834                          "Create new block %d to sink mask stores.",
8835                          store_bb->index);
8836       /* Create vector comparison with boolean result.  */
8837       vectype = TREE_TYPE (mask);
8838       zero = build_zero_cst (vectype);
8839       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8840       gsi = gsi_last_bb (bb);
8841       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8842       /* Create new PHI node for vdef of the last masked store:
8843          .MEM_2 = VDEF <.MEM_1>
8844          will be converted to
8845          .MEM.3 = VDEF <.MEM_1>
8846          and new PHI node will be created in join bb
8847          .MEM_2 = PHI <.MEM_1, .MEM_3>
8848       */
8849       vdef = gimple_vdef (last);
8850       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8851       gimple_set_vdef (last, new_vdef);
8852       phi = create_phi_node (vdef, join_bb);
8853       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8854
8855       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8856       while (true)
8857         {
8858           gimple_stmt_iterator gsi_from;
8859           gimple *stmt1 = NULL;
8860
8861           /* Move masked store to STORE_BB.  */
8862           last_store = last;
8863           gsi = gsi_for_stmt (last);
8864           gsi_from = gsi;
8865           /* Shift GSI to the previous stmt for further traversal.  */
8866           gsi_prev (&gsi);
8867           gsi_to = gsi_start_bb (store_bb);
8868           gsi_move_before (&gsi_from, &gsi_to);
8869           /* Setup GSI_TO to the non-empty block start.  */
8870           gsi_to = gsi_start_bb (store_bb);
8871           if (dump_enabled_p ())
8872             {
8873               dump_printf_loc (MSG_NOTE, vect_location,
8874                                "Move stmt to created bb\n");
8875               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8876             }
8877           /* Move all stored value producers if possible.  */
8878           while (!gsi_end_p (gsi))
8879             {
8880               tree lhs;
8881               imm_use_iterator imm_iter;
8882               use_operand_p use_p;
8883               bool res;
8884
8885               /* Skip debug statements.  */
8886               if (is_gimple_debug (gsi_stmt (gsi)))
8887                 {
8888                   gsi_prev (&gsi);
8889                   continue;
8890                 }
8891               stmt1 = gsi_stmt (gsi);
8892               /* Do not consider statements writing to memory or having
8893                  volatile operand.  */
8894               if (gimple_vdef (stmt1)
8895                   || gimple_has_volatile_ops (stmt1))
8896                 break;
8897               gsi_from = gsi;
8898               gsi_prev (&gsi);
8899               lhs = gimple_get_lhs (stmt1);
8900               if (!lhs)
8901                 break;
8902
8903               /* LHS of vectorized stmt must be SSA_NAME.  */
8904               if (TREE_CODE (lhs) != SSA_NAME)
8905                 break;
8906
8907               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8908                 {
8909                   /* Remove dead scalar statement.  */
8910                   if (has_zero_uses (lhs))
8911                     {
8912                       gsi_remove (&gsi_from, true);
8913                       continue;
8914                     }
8915                 }
8916
8917               /* Check that LHS does not have uses outside of STORE_BB.  */
8918               res = true;
8919               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8920                 {
8921                   gimple *use_stmt;
8922                   use_stmt = USE_STMT (use_p);
8923                   if (is_gimple_debug (use_stmt))
8924                     continue;
8925                   if (gimple_bb (use_stmt) != store_bb)
8926                     {
8927                       res = false;
8928                       break;
8929                     }
8930                 }
8931               if (!res)
8932                 break;
8933
8934               if (gimple_vuse (stmt1)
8935                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
8936                 break;
8937
8938               /* Can move STMT1 to STORE_BB.  */
8939               if (dump_enabled_p ())
8940                 {
8941                   dump_printf_loc (MSG_NOTE, vect_location,
8942                                    "Move stmt to created bb\n");
8943                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8944                 }
8945               gsi_move_before (&gsi_from, &gsi_to);
8946               /* Shift GSI_TO for further insertion.  */
8947               gsi_prev (&gsi_to);
8948             }
8949           /* Put other masked stores with the same mask to STORE_BB.  */
8950           if (worklist.is_empty ()
8951               || gimple_call_arg (worklist.last (), 2) != mask
8952               || worklist.last () != stmt1)
8953             break;
8954           last = worklist.pop ();
8955         }
8956       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8957     }
8958 }