gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations which vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157
 158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 159    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 160    may already be set for general statements (not just data refs).  */
 161
 162 static bool
 163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
 164                               bool vectype_maybe_set_p,
 165                               poly_uint64 *vf,
 166                               vec<stmt_vec_info > *mask_producers)
 167 {
 168   gimple *stmt = stmt_info->stmt;
 169
 170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 171        && !STMT_VINFO_LIVE_P (stmt_info))
 172       || gimple_clobber_p (stmt))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 176       return true;
 177     }
 178
 179   tree stmt_vectype, nunits_vectype;
 180   if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
 181                                        &nunits_vectype))
 182     return false;
 183
 184   if (stmt_vectype)
 185     {
 186       if (STMT_VINFO_VECTYPE (stmt_info))
 187         /* The only case when a vectype had been already set is for stmts
 188            that contain a data ref, or for "pattern-stmts" (stmts generated
 189            by the vectorizer to represent/replace a certain idiom).  */
 190         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 191                      || vectype_maybe_set_p)
 192                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 193       else if (stmt_vectype == boolean_type_node)
 194         mask_producers->safe_push (stmt_info);
 195       else
 196         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 197     }
 198
 199   if (nunits_vectype)
 200     vect_update_max_nunits (vf, nunits_vectype);
 201
 202   return true;
 203 }
 204
 205 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 206    types of STMT_INFO and all attached pattern statements and update
 207    the vectorization factor VF accordingly.  If some of the statements
 208    produce a mask result whose vector type can only be calculated later,
 209    add them to MASK_PRODUCERS.  Return true on success or false if
 210    something prevented vectorization.  */
 211
 212 static bool
 213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
 214                             vec<stmt_vec_info > *mask_producers)
 215 {
 216   if (dump_enabled_p ())
 217     {
 218       dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
 219       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 220     }
 221   if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
 222     return false;
 223
 224   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 225       && STMT_VINFO_RELATED_STMT (stmt_info))
 226     {
 227       stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
 228
 229       /* If a pattern statement has def stmts, analyze them too.  */
 230       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 231       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 232            !gsi_end_p (si); gsi_next (&si))
 233         {
 234           stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
 235           if (dump_enabled_p ())
 236             {
 237               dump_printf_loc (MSG_NOTE, vect_location,
 238                                "==> examining pattern def stmt: ");
 239               dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 240                                 def_stmt_info->stmt, 0);
 241             }
 242           if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 243                                              vf, mask_producers))
 244             return false;
 245         }
 246
 247       if (dump_enabled_p ())
 248         {
 249           dump_printf_loc (MSG_NOTE, vect_location,
 250                            "==> examining pattern statement: ");
 251           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 252         }
 253       if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
 254         return false;
 255     }
 256
 257   return true;
 258 }
 259
 260 /* Function vect_determine_vectorization_factor
 261
 262    Determine the vectorization factor (VF).  VF is the number of data elements
 263    that are operated upon in parallel in a single iteration of the vectorized
 264    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 265    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 266    elements can fit in a single vector register.
 267
 268    We currently support vectorization of loops in which all types operated upon
 269    are of the same size.  Therefore this function currently sets VF according to
 270    the size of the types operated upon, and fails if there are multiple sizes
 271    in the loop.
 272
 273    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 274    original loop:
 275         for (i=0; i<N; i++){
 276           a[i] = b[i] + c[i];
 277         }
 278
 279    vectorized loop:
 280         for (i=0; i<N; i+=VF){
 281           a[i:VF] = b[i:VF] + c[i:VF];
 282         }
 283 */
 284
 285 static bool
 286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 287 {
 288   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 289   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 290   unsigned nbbs = loop->num_nodes;
 291   poly_uint64 vectorization_factor = 1;
 292   tree scalar_type = NULL_TREE;
 293   gphi *phi;
 294   tree vectype;
 295   stmt_vec_info stmt_info;
 296   unsigned i;
 297   auto_vec<stmt_vec_info> mask_producers;
 298
 299   if (dump_enabled_p ())
 300     dump_printf_loc (MSG_NOTE, vect_location,
 301                      "=== vect_determine_vectorization_factor ===\n");
 302
 303   for (i = 0; i < nbbs; i++)
 304     {
 305       basic_block bb = bbs[i];
 306
 307       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 308            gsi_next (&si))
 309         {
 310           phi = si.phi ();
 311           stmt_info = vinfo_for_stmt (phi);
 312           if (dump_enabled_p ())
 313             {
 314               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 315               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 316             }
 317
 318           gcc_assert (stmt_info);
 319
 320           if (STMT_VINFO_RELEVANT_P (stmt_info)
 321               || STMT_VINFO_LIVE_P (stmt_info))
 322             {
 323               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 324               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 325
 326               if (dump_enabled_p ())
 327                 {
 328                   dump_printf_loc (MSG_NOTE, vect_location,
 329                                    "get vectype for scalar type:  ");
 330                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 331                   dump_printf (MSG_NOTE, "\n");
 332                 }
 333
 334               vectype = get_vectype_for_scalar_type (scalar_type);
 335               if (!vectype)
 336                 {
 337                   if (dump_enabled_p ())
 338                     {
 339                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 340                                        "not vectorized: unsupported "
 341                                        "data-type ");
 342                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 343                                          scalar_type);
 344                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 345                     }
 346                   return false;
 347                 }
 348               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 349
 350               if (dump_enabled_p ())
 351                 {
 352                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 353                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 354                   dump_printf (MSG_NOTE, "\n");
 355                 }
 356
 357               if (dump_enabled_p ())
 358                 {
 359                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 360                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 361                   dump_printf (MSG_NOTE, "\n");
 362                 }
 363
 364               vect_update_max_nunits (&vectorization_factor, vectype);
 365             }
 366         }
 367
 368       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 369            gsi_next (&si))
 370         {
 371           stmt_info = vinfo_for_stmt (gsi_stmt (si));
 372           if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
 373                                            &mask_producers))
 374             return false;
 375         }
 376     }
 377
 378   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 379   if (dump_enabled_p ())
 380     {
 381       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 382       dump_dec (MSG_NOTE, vectorization_factor);
 383       dump_printf (MSG_NOTE, "\n");
 384     }
 385
 386   if (known_le (vectorization_factor, 1U))
 387     {
 388       if (dump_enabled_p ())
 389         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 390                          "not vectorized: unsupported data-type\n");
 391       return false;
 392     }
 393   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 394
 395   for (i = 0; i < mask_producers.length (); i++)
 396     {
 397       stmt_info = mask_producers[i];
 398       tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
 399       if (!mask_type)
 400         return false;
 401       STMT_VINFO_VECTYPE (stmt_info) = mask_type;
 402     }
 403
 404   return true;
 405 }
 406
 407
 408 /* Function vect_is_simple_iv_evolution.
 409
 410    FORNOW: A simple evolution of an induction variables in the loop is
 411    considered a polynomial evolution.  */
 412
 413 static bool
 414 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 415                              tree * step)
 416 {
 417   tree init_expr;
 418   tree step_expr;
 419   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 420   basic_block bb;
 421
 422   /* When there is no evolution in this loop, the evolution function
 423      is not "simple".  */
 424   if (evolution_part == NULL_TREE)
 425     return false;
 426
 427   /* When the evolution is a polynomial of degree >= 2
 428      the evolution function is not "simple".  */
 429   if (tree_is_chrec (evolution_part))
 430     return false;
 431
 432   step_expr = evolution_part;
 433   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 434
 435   if (dump_enabled_p ())
 436     {
 437       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 438       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 439       dump_printf (MSG_NOTE, ",  init: ");
 440       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 441       dump_printf (MSG_NOTE, "\n");
 442     }
 443
 444   *init = init_expr;
 445   *step = step_expr;
 446
 447   if (TREE_CODE (step_expr) != INTEGER_CST
 448       && (TREE_CODE (step_expr) != SSA_NAME
 449           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 450               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 451           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 452               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 453                   || !flag_associative_math)))
 454       && (TREE_CODE (step_expr) != REAL_CST
 455           || !flag_associative_math))
 456     {
 457       if (dump_enabled_p ())
 458         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 459                          "step unknown.\n");
 460       return false;
 461     }
 462
 463   return true;
 464 }
 465
 466 /* Function vect_analyze_scalar_cycles_1.
 467
 468    Examine the cross iteration def-use cycles of scalar variables
 469    in LOOP.  LOOP_VINFO represents the loop that is now being
 470    considered for vectorization (can be LOOP, or an outer-loop
 471    enclosing LOOP).  */
 472
 473 static void
 474 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 475 {
 476   basic_block bb = loop->header;
 477   tree init, step;
 478   auto_vec<gimple *, 64> worklist;
 479   gphi_iterator gsi;
 480   bool double_reduc;
 481
 482   if (dump_enabled_p ())
 483     dump_printf_loc (MSG_NOTE, vect_location,
 484                      "=== vect_analyze_scalar_cycles ===\n");
 485
 486   /* First - identify all inductions.  Reduction detection assumes that all the
 487      inductions have been identified, therefore, this order must not be
 488      changed.  */
 489   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 490     {
 491       gphi *phi = gsi.phi ();
 492       tree access_fn = NULL;
 493       tree def = PHI_RESULT (phi);
 494       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 495
 496       if (dump_enabled_p ())
 497         {
 498           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 499           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 500         }
 501
 502       /* Skip virtual phi's.  The data dependences that are associated with
 503          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 504       if (virtual_operand_p (def))
 505         continue;
 506
 507       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 508
 509       /* Analyze the evolution function.  */
 510       access_fn = analyze_scalar_evolution (loop, def);
 511       if (access_fn)
 512         {
 513           STRIP_NOPS (access_fn);
 514           if (dump_enabled_p ())
 515             {
 516               dump_printf_loc (MSG_NOTE, vect_location,
 517                                "Access function of PHI: ");
 518               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 519               dump_printf (MSG_NOTE, "\n");
 520             }
 521           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 522             = initial_condition_in_loop_num (access_fn, loop->num);
 523           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 524             = evolution_part_in_loop_num (access_fn, loop->num);
 525         }
 526
 527       if (!access_fn
 528           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 529           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 530               && TREE_CODE (step) != INTEGER_CST))
 531         {
 532           worklist.safe_push (phi);
 533           continue;
 534         }
 535
 536       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 537                   != NULL_TREE);
 538       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 539
 540       if (dump_enabled_p ())
 541         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 542       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 543     }
 544
 545
 546   /* Second - identify all reductions and nested cycles.  */
 547   while (worklist.length () > 0)
 548     {
 549       gimple *phi = worklist.pop ();
 550       tree def = PHI_RESULT (phi);
 551       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 552       gimple *reduc_stmt;
 553
 554       if (dump_enabled_p ())
 555         {
 556           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 557           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 558         }
 559
 560       gcc_assert (!virtual_operand_p (def)
 561                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 562
 563       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 564                                                 &double_reduc, false);
 565       if (reduc_stmt)
 566         {
 567           if (double_reduc)
 568             {
 569               if (dump_enabled_p ())
 570                 dump_printf_loc (MSG_NOTE, vect_location,
 571                                  "Detected double reduction.\n");
 572
 573               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 574               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 575                                                     vect_double_reduction_def;
 576             }
 577           else
 578             {
 579               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 580                 {
 581                   if (dump_enabled_p ())
 582                     dump_printf_loc (MSG_NOTE, vect_location,
 583                                      "Detected vectorizable nested cycle.\n");
 584
 585                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 586                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 587                                                              vect_nested_cycle;
 588                 }
 589               else
 590                 {
 591                   if (dump_enabled_p ())
 592                     dump_printf_loc (MSG_NOTE, vect_location,
 593                                      "Detected reduction.\n");
 594
 595                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 596                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 597                                                            vect_reduction_def;
 598                   /* Store the reduction cycles for possible vectorization in
 599                      loop-aware SLP if it was not detected as reduction
 600                      chain.  */
 601                   if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 602                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 603                 }
 604             }
 605         }
 606       else
 607         if (dump_enabled_p ())
 608           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 609                            "Unknown def-use cycle pattern.\n");
 610     }
 611 }
 612
 613
 614 /* Function vect_analyze_scalar_cycles.
 615
 616    Examine the cross iteration def-use cycles of scalar variables, by
 617    analyzing the loop-header PHIs of scalar variables.  Classify each
 618    cycle as one of the following: invariant, induction, reduction, unknown.
 619    We do that for the loop represented by LOOP_VINFO, and also to its
 620    inner-loop, if exists.
 621    Examples for scalar cycles:
 622
 623    Example1: reduction:
 624
 625               loop1:
 626               for (i=0; i<N; i++)
 627                  sum += a[i];
 628
 629    Example2: induction:
 630
 631               loop2:
 632               for (i=0; i<N; i++)
 633                  a[i] = i;  */
 634
 635 static void
 636 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 637 {
 638   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 639
 640   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 641
 642   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 643      Reductions in such inner-loop therefore have different properties than
 644      the reductions in the nest that gets vectorized:
 645      1. When vectorized, they are executed in the same order as in the original
 646         scalar loop, so we can't change the order of computation when
 647         vectorizing them.
 648      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 649         current checks are too strict.  */
 650
 651   if (loop->inner)
 652     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 653 }
 654
 655 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 656
 657 static void
 658 vect_fixup_reduc_chain (gimple *stmt)
 659 {
 660   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 661   gimple *stmtp;
 662   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 663               && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 664   REDUC_GROUP_SIZE (vinfo_for_stmt (firstp))
 665     = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
 666   do
 667     {
 668       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 669       REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 670       stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 671       if (stmt)
 672         REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 673           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 674     }
 675   while (stmt);
 676   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 677 }
 678
 679 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 680
 681 static void
 682 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 683 {
 684   gimple *first;
 685   unsigned i;
 686
 687   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 688     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 689       {
 690         gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 691         while (next)
 692           {
 693             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 694               break;
 695             next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 696           }
 697         /* If not all stmt in the chain are patterns try to handle
 698            the chain without patterns.  */
 699         if (! next)
 700           {
 701             vect_fixup_reduc_chain (first);
 702             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 703               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 704           }
 705       }
 706 }
 707
 708 /* Function vect_get_loop_niters.
 709
 710    Determine how many iterations the loop is executed and place it
 711    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 712    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 713    niter information holds in ASSUMPTIONS.
 714
 715    Return the loop exit condition.  */
 716
 717
 718 static gcond *
 719 vect_get_loop_niters (struct loop *loop, tree *assumptions,
 720                       tree *number_of_iterations, tree *number_of_iterationsm1)
 721 {
 722   edge exit = single_exit (loop);
 723   struct tree_niter_desc niter_desc;
 724   tree niter_assumptions, niter, may_be_zero;
 725   gcond *cond = get_loop_exit_condition (loop);
 726
 727   *assumptions = boolean_true_node;
 728   *number_of_iterationsm1 = chrec_dont_know;
 729   *number_of_iterations = chrec_dont_know;
 730   if (dump_enabled_p ())
 731     dump_printf_loc (MSG_NOTE, vect_location,
 732                      "=== get_loop_niters ===\n");
 733
 734   if (!exit)
 735     return cond;
 736
 737   niter = chrec_dont_know;
 738   may_be_zero = NULL_TREE;
 739   niter_assumptions = boolean_true_node;
 740   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 741       || chrec_contains_undetermined (niter_desc.niter))
 742     return cond;
 743
 744   niter_assumptions = niter_desc.assumptions;
 745   may_be_zero = niter_desc.may_be_zero;
 746   niter = niter_desc.niter;
 747
 748   if (may_be_zero && integer_zerop (may_be_zero))
 749     may_be_zero = NULL_TREE;
 750
 751   if (may_be_zero)
 752     {
 753       if (COMPARISON_CLASS_P (may_be_zero))
 754         {
 755           /* Try to combine may_be_zero with assumptions, this can simplify
 756              computation of niter expression.  */
 757           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 758             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 759                                              niter_assumptions,
 760                                              fold_build1 (TRUTH_NOT_EXPR,
 761                                                           boolean_type_node,
 762                                                           may_be_zero));
 763           else
 764             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 765                                  build_int_cst (TREE_TYPE (niter), 0),
 766                                  rewrite_to_non_trapping_overflow (niter));
 767
 768           may_be_zero = NULL_TREE;
 769         }
 770       else if (integer_nonzerop (may_be_zero))
 771         {
 772           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 773           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 774           return cond;
 775         }
 776       else
 777         return cond;
 778     }
 779
 780   *assumptions = niter_assumptions;
 781   *number_of_iterationsm1 = niter;
 782
 783   /* We want the number of loop header executions which is the number
 784      of latch executions plus one.
 785      ???  For UINT_MAX latch executions this number overflows to zero
 786      for loops like do { n++; } while (n != 0);  */
 787   if (niter && !chrec_contains_undetermined (niter))
 788     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 789                           build_int_cst (TREE_TYPE (niter), 1));
 790   *number_of_iterations = niter;
 791
 792   return cond;
 793 }
 794
 795 /* Function bb_in_loop_p
 796
 797    Used as predicate for dfs order traversal of the loop bbs.  */
 798
 799 static bool
 800 bb_in_loop_p (const_basic_block bb, const void *data)
 801 {
 802   const struct loop *const loop = (const struct loop *)data;
 803   if (flow_bb_inside_loop_p (loop, bb))
 804     return true;
 805   return false;
 806 }
 807
 808
 809 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 810    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 811
 812 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
 813   : vec_info (vec_info::loop, init_cost (loop_in)),
 814     loop (loop_in),
 815     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 816     num_itersm1 (NULL_TREE),
 817     num_iters (NULL_TREE),
 818     num_iters_unchanged (NULL_TREE),
 819     num_iters_assumptions (NULL_TREE),
 820     th (0),
 821     versioning_threshold (0),
 822     vectorization_factor (0),
 823     max_vectorization_factor (0),
 824     mask_skip_niters (NULL_TREE),
 825     mask_compare_type (NULL_TREE),
 826     unaligned_dr (NULL),
 827     peeling_for_alignment (0),
 828     ptr_mask (0),
 829     ivexpr_map (NULL),
 830     slp_unrolling_factor (1),
 831     single_scalar_iteration_cost (0),
 832     vectorizable (false),
 833     can_fully_mask_p (true),
 834     fully_masked_p (false),
 835     peeling_for_gaps (false),
 836     peeling_for_niter (false),
 837     operands_swapped (false),
 838     no_data_dependencies (false),
 839     has_mask_store (false),
 840     scalar_loop (NULL),
 841     orig_loop_info (NULL)
 842 {
 843   /* Create/Update stmt_info for all stmts in the loop.  */
 844   basic_block *body = get_loop_body (loop);
 845   for (unsigned int i = 0; i < loop->num_nodes; i++)
 846     {
 847       basic_block bb = body[i];
 848       gimple_stmt_iterator si;
 849
 850       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 851         {
 852           gimple *phi = gsi_stmt (si);
 853           gimple_set_uid (phi, 0);
 854           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
 855         }
 856
 857       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 858         {
 859           gimple *stmt = gsi_stmt (si);
 860           gimple_set_uid (stmt, 0);
 861           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
 862         }
 863     }
 864   free (body);
 865
 866   /* CHECKME: We want to visit all BBs before their successors (except for
 867      latch blocks, for which this assertion wouldn't hold).  In the simple
 868      case of the loop forms we allow, a dfs order of the BBs would the same
 869      as reversed postorder traversal, so we are safe.  */
 870
 871   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 872                                           bbs, loop->num_nodes, loop);
 873   gcc_assert (nbbs == loop->num_nodes);
 874 }
 875
 876 /* Free all levels of MASKS.  */
 877
 878 void
 879 release_vec_loop_masks (vec_loop_masks *masks)
 880 {
 881   rgroup_masks *rgm;
 882   unsigned int i;
 883   FOR_EACH_VEC_ELT (*masks, i, rgm)
 884     rgm->masks.release ();
 885   masks->release ();
 886 }
 887
 888 /* Free all memory used by the _loop_vec_info, as well as all the
 889    stmt_vec_info structs of all the stmts in the loop.  */
 890
 891 _loop_vec_info::~_loop_vec_info ()
 892 {
 893   int nbbs;
 894   gimple_stmt_iterator si;
 895   int j;
 896
 897   /* ???  We're releasing loop_vinfos en-block.  */
 898   set_stmt_vec_info_vec (&stmt_vec_infos);
 899   nbbs = loop->num_nodes;
 900   for (j = 0; j < nbbs; j++)
 901     {
 902       basic_block bb = bbs[j];
 903       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 904         free_stmt_vec_info (gsi_stmt (si));
 905
 906       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 907         {
 908           gimple *stmt = gsi_stmt (si);
 909
 910           /* We may have broken canonical form by moving a constant
 911              into RHS1 of a commutative op.  Fix such occurrences.  */
 912           if (operands_swapped && is_gimple_assign (stmt))
 913             {
 914               enum tree_code code = gimple_assign_rhs_code (stmt);
 915
 916               if ((code == PLUS_EXPR
 917                    || code == POINTER_PLUS_EXPR
 918                    || code == MULT_EXPR)
 919                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 920                 swap_ssa_operands (stmt,
 921                                    gimple_assign_rhs1_ptr (stmt),
 922                                    gimple_assign_rhs2_ptr (stmt));
 923               else if (code == COND_EXPR
 924                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
 925                 {
 926                   tree cond_expr = gimple_assign_rhs1 (stmt);
 927                   enum tree_code cond_code = TREE_CODE (cond_expr);
 928
 929                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
 930                     {
 931                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
 932                                                                   0));
 933                       cond_code = invert_tree_comparison (cond_code,
 934                                                           honor_nans);
 935                       if (cond_code != ERROR_MARK)
 936                         {
 937                           TREE_SET_CODE (cond_expr, cond_code);
 938                           swap_ssa_operands (stmt,
 939                                              gimple_assign_rhs2_ptr (stmt),
 940                                              gimple_assign_rhs3_ptr (stmt));
 941                         }
 942                     }
 943                 }
 944             }
 945
 946           /* Free stmt_vec_info.  */
 947           free_stmt_vec_info (stmt);
 948           gsi_next (&si);
 949         }
 950     }
 951
 952   free (bbs);
 953
 954   release_vec_loop_masks (&masks);
 955   delete ivexpr_map;
 956
 957   loop->aux = NULL;
 958 }
 959
 960 /* Return an invariant or register for EXPR and emit necessary
 961    computations in the LOOP_VINFO loop preheader.  */
 962
 963 tree
 964 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 965 {
 966   if (is_gimple_reg (expr)
 967       || is_gimple_min_invariant (expr))
 968     return expr;
 969
 970   if (! loop_vinfo->ivexpr_map)
 971     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 972   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 973   if (! cached)
 974     {
 975       gimple_seq stmts = NULL;
 976       cached = force_gimple_operand (unshare_expr (expr),
 977                                      &stmts, true, NULL_TREE);
 978       if (stmts)
 979         {
 980           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 981           gsi_insert_seq_on_edge_immediate (e, stmts);
 982         }
 983     }
 984   return cached;
 985 }
 986
 987 /* Return true if we can use CMP_TYPE as the comparison type to produce
 988    all masks required to mask LOOP_VINFO.  */
 989
 990 static bool
 991 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 992 {
 993   rgroup_masks *rgm;
 994   unsigned int i;
 995   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 996     if (rgm->mask_type != NULL_TREE
 997         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 998                                             cmp_type, rgm->mask_type,
 999                                             OPTIMIZE_FOR_SPEED))
1000       return false;
1001   return true;
1002 }
1003
1004 /* Calculate the maximum number of scalars per iteration for every
1005    rgroup in LOOP_VINFO.  */
1006
1007 static unsigned int
1008 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1009 {
1010   unsigned int res = 1;
1011   unsigned int i;
1012   rgroup_masks *rgm;
1013   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1014     res = MAX (res, rgm->max_nscalars_per_iter);
1015   return res;
1016 }
1017
1018 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1019    whether we can actually generate the masks required.  Return true if so,
1020    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1021
1022 static bool
1023 vect_verify_full_masking (loop_vec_info loop_vinfo)
1024 {
1025   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1026   unsigned int min_ni_width;
1027
1028   /* Use a normal loop if there are no statements that need masking.
1029      This only happens in rare degenerate cases: it means that the loop
1030      has no loads, no stores, and no live-out values.  */
1031   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1032     return false;
1033
1034   /* Get the maximum number of iterations that is representable
1035      in the counter type.  */
1036   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1037   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1038
1039   /* Get a more refined estimate for the number of iterations.  */
1040   widest_int max_back_edges;
1041   if (max_loop_iterations (loop, &max_back_edges))
1042     max_ni = wi::smin (max_ni, max_back_edges + 1);
1043
1044   /* Account for rgroup masks, in which each bit is replicated N times.  */
1045   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1046
1047   /* Work out how many bits we need to represent the limit.  */
1048   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1049
1050   /* Find a scalar mode for which WHILE_ULT is supported.  */
1051   opt_scalar_int_mode cmp_mode_iter;
1052   tree cmp_type = NULL_TREE;
1053   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1054     {
1055       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1056       if (cmp_bits >= min_ni_width
1057           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1058         {
1059           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1060           if (this_type
1061               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1062             {
1063               /* Although we could stop as soon as we find a valid mode,
1064                  it's often better to continue until we hit Pmode, since the
1065                  operands to the WHILE are more likely to be reusable in
1066                  address calculations.  */
1067               cmp_type = this_type;
1068               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1069                 break;
1070             }
1071         }
1072     }
1073
1074   if (!cmp_type)
1075     return false;
1076
1077   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1078   return true;
1079 }
1080
1081 /* Calculate the cost of one scalar iteration of the loop.  */
1082 static void
1083 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1084 {
1085   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1086   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1087   int nbbs = loop->num_nodes, factor;
1088   int innerloop_iters, i;
1089
1090   /* Gather costs for statements in the scalar loop.  */
1091
1092   /* FORNOW.  */
1093   innerloop_iters = 1;
1094   if (loop->inner)
1095     innerloop_iters = 50; /* FIXME */
1096
1097   for (i = 0; i < nbbs; i++)
1098     {
1099       gimple_stmt_iterator si;
1100       basic_block bb = bbs[i];
1101
1102       if (bb->loop_father == loop->inner)
1103         factor = innerloop_iters;
1104       else
1105         factor = 1;
1106
1107       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1108         {
1109           gimple *stmt = gsi_stmt (si);
1110           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1111
1112           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1113             continue;
1114
1115           /* Skip stmts that are not vectorized inside the loop.  */
1116           if (stmt_info
1117               && !STMT_VINFO_RELEVANT_P (stmt_info)
1118               && (!STMT_VINFO_LIVE_P (stmt_info)
1119                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1120               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1121             continue;
1122
1123           vect_cost_for_stmt kind;
1124           if (STMT_VINFO_DATA_REF (stmt_info))
1125             {
1126               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1127                kind = scalar_load;
1128              else
1129                kind = scalar_store;
1130             }
1131           else
1132             kind = scalar_stmt;
1133
1134           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1135                             factor, kind, stmt_info, 0, vect_prologue);
1136         }
1137     }
1138
1139   /* Now accumulate cost.  */
1140   void *target_cost_data = init_cost (loop);
1141   stmt_info_for_cost *si;
1142   int j;
1143   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1144                     j, si)
1145     {
1146       struct _stmt_vec_info *stmt_info
1147         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1148       (void) add_stmt_cost (target_cost_data, si->count,
1149                             si->kind, stmt_info, si->misalign,
1150                             vect_body);
1151     }
1152   unsigned dummy, body_cost = 0;
1153   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1154   destroy_cost_data (target_cost_data);
1155   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1156 }
1157
1158
1159 /* Function vect_analyze_loop_form_1.
1160
1161    Verify that certain CFG restrictions hold, including:
1162    - the loop has a pre-header
1163    - the loop has a single entry and exit
1164    - the loop exit condition is simple enough
1165    - the number of iterations can be analyzed, i.e, a countable loop.  The
1166      niter could be analyzed under some assumptions.  */
1167
1168 bool
1169 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1170                           tree *assumptions, tree *number_of_iterationsm1,
1171                           tree *number_of_iterations, gcond **inner_loop_cond)
1172 {
1173   if (dump_enabled_p ())
1174     dump_printf_loc (MSG_NOTE, vect_location,
1175                      "=== vect_analyze_loop_form ===\n");
1176
1177   /* Different restrictions apply when we are considering an inner-most loop,
1178      vs. an outer (nested) loop.
1179      (FORNOW. May want to relax some of these restrictions in the future).  */
1180
1181   if (!loop->inner)
1182     {
1183       /* Inner-most loop.  We currently require that the number of BBs is
1184          exactly 2 (the header and latch).  Vectorizable inner-most loops
1185          look like this:
1186
1187                         (pre-header)
1188                            |
1189                           header <--------+
1190                            | |            |
1191                            | +--> latch --+
1192                            |
1193                         (exit-bb)  */
1194
1195       if (loop->num_nodes != 2)
1196         {
1197           if (dump_enabled_p ())
1198             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199                              "not vectorized: control flow in loop.\n");
1200           return false;
1201         }
1202
1203       if (empty_block_p (loop->header))
1204         {
1205           if (dump_enabled_p ())
1206             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1207                              "not vectorized: empty loop.\n");
1208           return false;
1209         }
1210     }
1211   else
1212     {
1213       struct loop *innerloop = loop->inner;
1214       edge entryedge;
1215
1216       /* Nested loop. We currently require that the loop is doubly-nested,
1217          contains a single inner loop, and the number of BBs is exactly 5.
1218          Vectorizable outer-loops look like this:
1219
1220                         (pre-header)
1221                            |
1222                           header <---+
1223                            |         |
1224                           inner-loop |
1225                            |         |
1226                           tail ------+
1227                            |
1228                         (exit-bb)
1229
1230          The inner-loop has the properties expected of inner-most loops
1231          as described above.  */
1232
1233       if ((loop->inner)->inner || (loop->inner)->next)
1234         {
1235           if (dump_enabled_p ())
1236             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1237                              "not vectorized: multiple nested loops.\n");
1238           return false;
1239         }
1240
1241       if (loop->num_nodes != 5)
1242         {
1243           if (dump_enabled_p ())
1244             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1245                              "not vectorized: control flow in loop.\n");
1246           return false;
1247         }
1248
1249       entryedge = loop_preheader_edge (innerloop);
1250       if (entryedge->src != loop->header
1251           || !single_exit (innerloop)
1252           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1253         {
1254           if (dump_enabled_p ())
1255             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256                              "not vectorized: unsupported outerloop form.\n");
1257           return false;
1258         }
1259
1260       /* Analyze the inner-loop.  */
1261       tree inner_niterm1, inner_niter, inner_assumptions;
1262       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1263                                       &inner_assumptions, &inner_niterm1,
1264                                       &inner_niter, NULL)
1265           /* Don't support analyzing niter under assumptions for inner
1266              loop.  */
1267           || !integer_onep (inner_assumptions))
1268         {
1269           if (dump_enabled_p ())
1270             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271                              "not vectorized: Bad inner loop.\n");
1272           return false;
1273         }
1274
1275       if (!expr_invariant_in_loop_p (loop, inner_niter))
1276         {
1277           if (dump_enabled_p ())
1278             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1279                              "not vectorized: inner-loop count not"
1280                              " invariant.\n");
1281           return false;
1282         }
1283
1284       if (dump_enabled_p ())
1285         dump_printf_loc (MSG_NOTE, vect_location,
1286                          "Considering outer-loop vectorization.\n");
1287     }
1288
1289   if (!single_exit (loop)
1290       || EDGE_COUNT (loop->header->preds) != 2)
1291     {
1292       if (dump_enabled_p ())
1293         {
1294           if (!single_exit (loop))
1295             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1296                              "not vectorized: multiple exits.\n");
1297           else if (EDGE_COUNT (loop->header->preds) != 2)
1298             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1299                              "not vectorized: too many incoming edges.\n");
1300         }
1301       return false;
1302     }
1303
1304   /* We assume that the loop exit condition is at the end of the loop. i.e,
1305      that the loop is represented as a do-while (with a proper if-guard
1306      before the loop if needed), where the loop header contains all the
1307      executable statements, and the latch is empty.  */
1308   if (!empty_block_p (loop->latch)
1309       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1310     {
1311       if (dump_enabled_p ())
1312         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1313                          "not vectorized: latch block not empty.\n");
1314       return false;
1315     }
1316
1317   /* Make sure the exit is not abnormal.  */
1318   edge e = single_exit (loop);
1319   if (e->flags & EDGE_ABNORMAL)
1320     {
1321       if (dump_enabled_p ())
1322         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1323                          "not vectorized: abnormal loop exit edge.\n");
1324       return false;
1325     }
1326
1327   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1328                                      number_of_iterationsm1);
1329   if (!*loop_cond)
1330     {
1331       if (dump_enabled_p ())
1332         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1333                          "not vectorized: complicated exit condition.\n");
1334       return false;
1335     }
1336
1337   if (integer_zerop (*assumptions)
1338       || !*number_of_iterations
1339       || chrec_contains_undetermined (*number_of_iterations))
1340     {
1341       if (dump_enabled_p ())
1342         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1343                          "not vectorized: number of iterations cannot be "
1344                          "computed.\n");
1345       return false;
1346     }
1347
1348   if (integer_zerop (*number_of_iterations))
1349     {
1350       if (dump_enabled_p ())
1351         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1352                          "not vectorized: number of iterations = 0.\n");
1353       return false;
1354     }
1355
1356   return true;
1357 }
1358
1359 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1360
1361 loop_vec_info
1362 vect_analyze_loop_form (struct loop *loop)
1363 {
1364   tree assumptions, number_of_iterations, number_of_iterationsm1;
1365   gcond *loop_cond, *inner_loop_cond = NULL;
1366
1367   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1368                                   &assumptions, &number_of_iterationsm1,
1369                                   &number_of_iterations, &inner_loop_cond))
1370     return NULL;
1371
1372   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1373   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1374   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1375   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1376   if (!integer_onep (assumptions))
1377     {
1378       /* We consider to vectorize this loop by versioning it under
1379          some assumptions.  In order to do this, we need to clear
1380          existing information computed by scev and niter analyzer.  */
1381       scev_reset_htab ();
1382       free_numbers_of_iterations_estimates (loop);
1383       /* Also set flag for this loop so that following scev and niter
1384          analysis are done under the assumptions.  */
1385       loop_constraint_set (loop, LOOP_C_FINITE);
1386       /* Also record the assumptions for versioning.  */
1387       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1388     }
1389
1390   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1391     {
1392       if (dump_enabled_p ())
1393         {
1394           dump_printf_loc (MSG_NOTE, vect_location,
1395                            "Symbolic number of iterations is ");
1396           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1397           dump_printf (MSG_NOTE, "\n");
1398         }
1399     }
1400
1401   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1402   if (inner_loop_cond)
1403     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1404       = loop_exit_ctrl_vec_info_type;
1405
1406   gcc_assert (!loop->aux);
1407   loop->aux = loop_vinfo;
1408   return loop_vinfo;
1409 }
1410
1411
1412
1413 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1414    statements update the vectorization factor.  */
1415
1416 static void
1417 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1418 {
1419   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1420   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1421   int nbbs = loop->num_nodes;
1422   poly_uint64 vectorization_factor;
1423   int i;
1424
1425   if (dump_enabled_p ())
1426     dump_printf_loc (MSG_NOTE, vect_location,
1427                      "=== vect_update_vf_for_slp ===\n");
1428
1429   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1430   gcc_assert (known_ne (vectorization_factor, 0U));
1431
1432   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1433      vectorization factor of the loop is the unrolling factor required by
1434      the SLP instances.  If that unrolling factor is 1, we say, that we
1435      perform pure SLP on loop - cross iteration parallelism is not
1436      exploited.  */
1437   bool only_slp_in_loop = true;
1438   for (i = 0; i < nbbs; i++)
1439     {
1440       basic_block bb = bbs[i];
1441       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1442            gsi_next (&si))
1443         {
1444           gimple *stmt = gsi_stmt (si);
1445           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1446           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1447               && STMT_VINFO_RELATED_STMT (stmt_info))
1448             {
1449               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1450               stmt_info = vinfo_for_stmt (stmt);
1451             }
1452           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1453                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1454               && !PURE_SLP_STMT (stmt_info))
1455             /* STMT needs both SLP and loop-based vectorization.  */
1456             only_slp_in_loop = false;
1457         }
1458     }
1459
1460   if (only_slp_in_loop)
1461     {
1462       dump_printf_loc (MSG_NOTE, vect_location,
1463                        "Loop contains only SLP stmts\n");
1464       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1465     }
1466   else
1467     {
1468       dump_printf_loc (MSG_NOTE, vect_location,
1469                        "Loop contains SLP and non-SLP stmts\n");
1470       /* Both the vectorization factor and unroll factor have the form
1471          current_vector_size * X for some rational X, so they must have
1472          a common multiple.  */
1473       vectorization_factor
1474         = force_common_multiple (vectorization_factor,
1475                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1476     }
1477
1478   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1479   if (dump_enabled_p ())
1480     {
1481       dump_printf_loc (MSG_NOTE, vect_location,
1482                        "Updating vectorization factor to ");
1483       dump_dec (MSG_NOTE, vectorization_factor);
1484       dump_printf (MSG_NOTE, ".\n");
1485     }
1486 }
1487
1488 /* Return true if STMT_INFO describes a double reduction phi and if
1489    the other phi in the reduction is also relevant for vectorization.
1490    This rejects cases such as:
1491
1492       outer1:
1493         x_1 = PHI <x_3(outer2), ...>;
1494         ...
1495
1496       inner:
1497         x_2 = ...;
1498         ...
1499
1500       outer2:
1501         x_3 = PHI <x_2(inner)>;
1502
1503    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1504
1505 static bool
1506 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1507 {
1508   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1509     return false;
1510
1511   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1512   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1513 }
1514
1515 /* Function vect_analyze_loop_operations.
1516
1517    Scan the loop stmts and make sure they are all vectorizable.  */
1518
1519 static bool
1520 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1521 {
1522   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1523   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1524   int nbbs = loop->num_nodes;
1525   int i;
1526   stmt_vec_info stmt_info;
1527   bool need_to_vectorize = false;
1528   bool ok;
1529
1530   if (dump_enabled_p ())
1531     dump_printf_loc (MSG_NOTE, vect_location,
1532                      "=== vect_analyze_loop_operations ===\n");
1533
1534   stmt_vector_for_cost cost_vec;
1535   cost_vec.create (2);
1536
1537   for (i = 0; i < nbbs; i++)
1538     {
1539       basic_block bb = bbs[i];
1540
1541       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1542            gsi_next (&si))
1543         {
1544           gphi *phi = si.phi ();
1545           ok = true;
1546
1547           stmt_info = vinfo_for_stmt (phi);
1548           if (dump_enabled_p ())
1549             {
1550               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1551               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1552             }
1553           if (virtual_operand_p (gimple_phi_result (phi)))
1554             continue;
1555
1556           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1557              (i.e., a phi in the tail of the outer-loop).  */
1558           if (! is_loop_header_bb_p (bb))
1559             {
1560               /* FORNOW: we currently don't support the case that these phis
1561                  are not used in the outerloop (unless it is double reduction,
1562                  i.e., this phi is vect_reduction_def), cause this case
1563                  requires to actually do something here.  */
1564               if (STMT_VINFO_LIVE_P (stmt_info)
1565                   && !vect_active_double_reduction_p (stmt_info))
1566                 {
1567                   if (dump_enabled_p ())
1568                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1569                                      "Unsupported loop-closed phi in "
1570                                      "outer-loop.\n");
1571                   return false;
1572                 }
1573
1574               /* If PHI is used in the outer loop, we check that its operand
1575                  is defined in the inner loop.  */
1576               if (STMT_VINFO_RELEVANT_P (stmt_info))
1577                 {
1578                   tree phi_op;
1579                   gimple *op_def_stmt;
1580
1581                   if (gimple_phi_num_args (phi) != 1)
1582                     return false;
1583
1584                   phi_op = PHI_ARG_DEF (phi, 0);
1585                   if (TREE_CODE (phi_op) != SSA_NAME)
1586                     return false;
1587
1588                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1589                   if (gimple_nop_p (op_def_stmt)
1590                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1591                       || !vinfo_for_stmt (op_def_stmt))
1592                     return false;
1593
1594                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1595                         != vect_used_in_outer
1596                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1597                            != vect_used_in_outer_by_reduction)
1598                     return false;
1599                 }
1600
1601               continue;
1602             }
1603
1604           gcc_assert (stmt_info);
1605
1606           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1607                || STMT_VINFO_LIVE_P (stmt_info))
1608               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1609             {
1610               /* A scalar-dependence cycle that we don't support.  */
1611               if (dump_enabled_p ())
1612                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1613                                  "not vectorized: scalar dependence cycle.\n");
1614               return false;
1615             }
1616
1617           if (STMT_VINFO_RELEVANT_P (stmt_info))
1618             {
1619               need_to_vectorize = true;
1620               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1621                   && ! PURE_SLP_STMT (stmt_info))
1622                 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1623               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1624                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1625                        && ! PURE_SLP_STMT (stmt_info))
1626                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1627                                              &cost_vec);
1628             }
1629
1630           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1631           if (ok
1632               && STMT_VINFO_LIVE_P (stmt_info)
1633               && !PURE_SLP_STMT (stmt_info))
1634             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1635                                               &cost_vec);
1636
1637           if (!ok)
1638             {
1639               if (dump_enabled_p ())
1640                 {
1641                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1642                                    "not vectorized: relevant phi not "
1643                                    "supported: ");
1644                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1645                 }
1646               return false;
1647             }
1648         }
1649
1650       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1651            gsi_next (&si))
1652         {
1653           gimple *stmt = gsi_stmt (si);
1654           if (!gimple_clobber_p (stmt)
1655               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1656                                      &cost_vec))
1657             return false;
1658         }
1659     } /* bbs */
1660
1661   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1662   cost_vec.release ();
1663
1664   /* All operations in the loop are either irrelevant (deal with loop
1665      control, or dead), or only used outside the loop and can be moved
1666      out of the loop (e.g. invariants, inductions).  The loop can be
1667      optimized away by scalar optimizations.  We're better off not
1668      touching this loop.  */
1669   if (!need_to_vectorize)
1670     {
1671       if (dump_enabled_p ())
1672         dump_printf_loc (MSG_NOTE, vect_location,
1673                          "All the computation can be taken out of the loop.\n");
1674       if (dump_enabled_p ())
1675         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1676                          "not vectorized: redundant loop. no profit to "
1677                          "vectorize.\n");
1678       return false;
1679     }
1680
1681   return true;
1682 }
1683
1684 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1685    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1686    definitely no, or -1 if it's worth retrying.  */
1687
1688 static int
1689 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1690 {
1691   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1692   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1693
1694   /* Only fully-masked loops can have iteration counts less than the
1695      vectorization factor.  */
1696   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1697     {
1698       HOST_WIDE_INT max_niter;
1699
1700       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1701         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1702       else
1703         max_niter = max_stmt_executions_int (loop);
1704
1705       if (max_niter != -1
1706           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1707         {
1708           if (dump_enabled_p ())
1709             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710                              "not vectorized: iteration count smaller than "
1711                              "vectorization factor.\n");
1712           return 0;
1713         }
1714     }
1715
1716   int min_profitable_iters, min_profitable_estimate;
1717   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1718                                       &min_profitable_estimate);
1719
1720   if (min_profitable_iters < 0)
1721     {
1722       if (dump_enabled_p ())
1723         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724                          "not vectorized: vectorization not profitable.\n");
1725       if (dump_enabled_p ())
1726         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1727                          "not vectorized: vector version will never be "
1728                          "profitable.\n");
1729       return -1;
1730     }
1731
1732   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1733                                * assumed_vf);
1734
1735   /* Use the cost model only if it is more conservative than user specified
1736      threshold.  */
1737   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1738                                     min_profitable_iters);
1739
1740   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1741
1742   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1743       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1744     {
1745       if (dump_enabled_p ())
1746         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1747                          "not vectorized: vectorization not profitable.\n");
1748       if (dump_enabled_p ())
1749         dump_printf_loc (MSG_NOTE, vect_location,
1750                          "not vectorized: iteration count smaller than user "
1751                          "specified loop bound parameter or minimum profitable "
1752                          "iterations (whichever is more conservative).\n");
1753       return 0;
1754     }
1755
1756   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1757   if (estimated_niter == -1)
1758     estimated_niter = likely_max_stmt_executions_int (loop);
1759   if (estimated_niter != -1
1760       && ((unsigned HOST_WIDE_INT) estimated_niter
1761           < MAX (th, (unsigned) min_profitable_estimate)))
1762     {
1763       if (dump_enabled_p ())
1764         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1765                          "not vectorized: estimated iteration count too "
1766                          "small.\n");
1767       if (dump_enabled_p ())
1768         dump_printf_loc (MSG_NOTE, vect_location,
1769                          "not vectorized: estimated iteration count smaller "
1770                          "than specified loop bound parameter or minimum "
1771                          "profitable iterations (whichever is more "
1772                          "conservative).\n");
1773       return -1;
1774     }
1775
1776   return 1;
1777 }
1778
1779 static bool
1780 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1781                            vec<data_reference_p> *datarefs,
1782                            unsigned int *n_stmts)
1783 {
1784   *n_stmts = 0;
1785   for (unsigned i = 0; i < loop->num_nodes; i++)
1786     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1787          !gsi_end_p (gsi); gsi_next (&gsi))
1788       {
1789         gimple *stmt = gsi_stmt (gsi);
1790         if (is_gimple_debug (stmt))
1791           continue;
1792         ++(*n_stmts);
1793         if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1794           {
1795             if (is_gimple_call (stmt) && loop->safelen)
1796               {
1797                 tree fndecl = gimple_call_fndecl (stmt), op;
1798                 if (fndecl != NULL_TREE)
1799                   {
1800                     cgraph_node *node = cgraph_node::get (fndecl);
1801                     if (node != NULL && node->simd_clones != NULL)
1802                       {
1803                         unsigned int j, n = gimple_call_num_args (stmt);
1804                         for (j = 0; j < n; j++)
1805                           {
1806                             op = gimple_call_arg (stmt, j);
1807                             if (DECL_P (op)
1808                                 || (REFERENCE_CLASS_P (op)
1809                                     && get_base_address (op)))
1810                               break;
1811                           }
1812                         op = gimple_call_lhs (stmt);
1813                         /* Ignore #pragma omp declare simd functions
1814                            if they don't have data references in the
1815                            call stmt itself.  */
1816                         if (j == n
1817                             && !(op
1818                                  && (DECL_P (op)
1819                                      || (REFERENCE_CLASS_P (op)
1820                                          && get_base_address (op)))))
1821                           continue;
1822                       }
1823                   }
1824               }
1825             return false;
1826           }
1827       }
1828   return true;
1829 }
1830
1831 /* Function vect_analyze_loop_2.
1832
1833    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1834    for it.  The different analyses will record information in the
1835    loop_vec_info struct.  */
1836 static bool
1837 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1838 {
1839   bool ok;
1840   int res;
1841   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1842   poly_uint64 min_vf = 2;
1843
1844   /* The first group of checks is independent of the vector size.  */
1845   fatal = true;
1846
1847   /* Find all data references in the loop (which correspond to vdefs/vuses)
1848      and analyze their evolution in the loop.  */
1849
1850   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1851   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1852     {
1853       if (dump_enabled_p ())
1854         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1855                          "not vectorized: loop nest containing two "
1856                          "or more consecutive inner loops cannot be "
1857                          "vectorized\n");
1858       return false;
1859     }
1860
1861   /* Gather the data references and count stmts in the loop.  */
1862   unsigned int n_stmts;
1863   if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1864                                   &LOOP_VINFO_DATAREFS (loop_vinfo),
1865                                   &n_stmts))
1866     {
1867       if (dump_enabled_p ())
1868         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1869                          "not vectorized: loop contains function "
1870                          "calls or data references that cannot "
1871                          "be analyzed\n");
1872       return false;
1873     }
1874
1875   /* Analyze the data references and also adjust the minimal
1876      vectorization factor according to the loads and stores.  */
1877
1878   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1879   if (!ok)
1880     {
1881       if (dump_enabled_p ())
1882         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1883                          "bad data references.\n");
1884       return false;
1885     }
1886
1887   /* Classify all cross-iteration scalar data-flow cycles.
1888      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1889   vect_analyze_scalar_cycles (loop_vinfo);
1890
1891   vect_pattern_recog (loop_vinfo);
1892
1893   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1894
1895   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1896      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1897
1898   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1899   if (!ok)
1900     {
1901       if (dump_enabled_p ())
1902         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1903                          "bad data access.\n");
1904       return false;
1905     }
1906
1907   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1908
1909   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1910   if (!ok)
1911     {
1912       if (dump_enabled_p ())
1913         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914                          "unexpected pattern.\n");
1915       return false;
1916     }
1917
1918   /* While the rest of the analysis below depends on it in some way.  */
1919   fatal = false;
1920
1921   /* Analyze data dependences between the data-refs in the loop
1922      and adjust the maximum vectorization factor according to
1923      the dependences.
1924      FORNOW: fail at the first data dependence that we encounter.  */
1925
1926   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1927   if (!ok
1928       || (max_vf != MAX_VECTORIZATION_FACTOR
1929           && maybe_lt (max_vf, min_vf)))
1930     {
1931       if (dump_enabled_p ())
1932             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1933                              "bad data dependence.\n");
1934       return false;
1935     }
1936   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1937
1938   ok = vect_determine_vectorization_factor (loop_vinfo);
1939   if (!ok)
1940     {
1941       if (dump_enabled_p ())
1942         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1943                          "can't determine vectorization factor.\n");
1944       return false;
1945     }
1946   if (max_vf != MAX_VECTORIZATION_FACTOR
1947       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1948     {
1949       if (dump_enabled_p ())
1950         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1951                          "bad data dependence.\n");
1952       return false;
1953     }
1954
1955   /* Compute the scalar iteration cost.  */
1956   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1957
1958   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1959   unsigned th;
1960
1961   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1962   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1963   if (!ok)
1964     return false;
1965
1966   /* If there are any SLP instances mark them as pure_slp.  */
1967   bool slp = vect_make_slp_decision (loop_vinfo);
1968   if (slp)
1969     {
1970       /* Find stmts that need to be both vectorized and SLPed.  */
1971       vect_detect_hybrid_slp (loop_vinfo);
1972
1973       /* Update the vectorization factor based on the SLP decision.  */
1974       vect_update_vf_for_slp (loop_vinfo);
1975     }
1976
1977   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1978
1979   /* We don't expect to have to roll back to anything other than an empty
1980      set of rgroups.  */
1981   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1982
1983   /* This is the point where we can re-start analysis with SLP forced off.  */
1984 start_over:
1985
1986   /* Now the vectorization factor is final.  */
1987   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1988   gcc_assert (known_ne (vectorization_factor, 0U));
1989
1990   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1991     {
1992       dump_printf_loc (MSG_NOTE, vect_location,
1993                        "vectorization_factor = ");
1994       dump_dec (MSG_NOTE, vectorization_factor);
1995       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1996                    LOOP_VINFO_INT_NITERS (loop_vinfo));
1997     }
1998
1999   HOST_WIDE_INT max_niter
2000     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2001
2002   /* Analyze the alignment of the data-refs in the loop.
2003      Fail if a data reference is found that cannot be vectorized.  */
2004
2005   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2006   if (!ok)
2007     {
2008       if (dump_enabled_p ())
2009         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2010                          "bad data alignment.\n");
2011       return false;
2012     }
2013
2014   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2015      It is important to call pruning after vect_analyze_data_ref_accesses,
2016      since we use grouping information gathered by interleaving analysis.  */
2017   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2018   if (!ok)
2019     return false;
2020
2021   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2022      vectorization.  */
2023   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2024     {
2025     /* This pass will decide on using loop versioning and/or loop peeling in
2026        order to enhance the alignment of data references in the loop.  */
2027     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2028     if (!ok)
2029       {
2030         if (dump_enabled_p ())
2031           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2032                            "bad data alignment.\n");
2033         return false;
2034       }
2035     }
2036
2037   if (slp)
2038     {
2039       /* Analyze operations in the SLP instances.  Note this may
2040          remove unsupported SLP instances which makes the above
2041          SLP kind detection invalid.  */
2042       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2043       vect_slp_analyze_operations (loop_vinfo);
2044       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2045         goto again;
2046     }
2047
2048   /* Scan all the remaining operations in the loop that are not subject
2049      to SLP and make sure they are vectorizable.  */
2050   ok = vect_analyze_loop_operations (loop_vinfo);
2051   if (!ok)
2052     {
2053       if (dump_enabled_p ())
2054         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2055                          "bad operation or unsupported loop bound.\n");
2056       return false;
2057     }
2058
2059   /* Decide whether to use a fully-masked loop for this vectorization
2060      factor.  */
2061   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2062     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2063        && vect_verify_full_masking (loop_vinfo));
2064   if (dump_enabled_p ())
2065     {
2066       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2067         dump_printf_loc (MSG_NOTE, vect_location,
2068                          "using a fully-masked loop.\n");
2069       else
2070         dump_printf_loc (MSG_NOTE, vect_location,
2071                          "not using a fully-masked loop.\n");
2072     }
2073
2074   /* If epilog loop is required because of data accesses with gaps,
2075      one additional iteration needs to be peeled.  Check if there is
2076      enough iterations for vectorization.  */
2077   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2078       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2079       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2080     {
2081       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2082       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2083
2084       if (known_lt (wi::to_widest (scalar_niters), vf))
2085         {
2086           if (dump_enabled_p ())
2087             dump_printf_loc (MSG_NOTE, vect_location,
2088                              "loop has no enough iterations to support"
2089                              " peeling for gaps.\n");
2090           return false;
2091         }
2092     }
2093
2094   /* Check the costings of the loop make vectorizing worthwhile.  */
2095   res = vect_analyze_loop_costing (loop_vinfo);
2096   if (res < 0)
2097     goto again;
2098   if (!res)
2099     {
2100       if (dump_enabled_p ())
2101         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2102                          "Loop costings not worthwhile.\n");
2103       return false;
2104     }
2105
2106   /* Decide whether we need to create an epilogue loop to handle
2107      remaining scalar iterations.  */
2108   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2109
2110   unsigned HOST_WIDE_INT const_vf;
2111   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2112     /* The main loop handles all iterations.  */
2113     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2114   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2115            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2116     {
2117       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2118                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2119                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2120         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2121     }
2122   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2123            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2124            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2125                 < (unsigned) exact_log2 (const_vf))
2126                /* In case of versioning, check if the maximum number of
2127                   iterations is greater than th.  If they are identical,
2128                   the epilogue is unnecessary.  */
2129                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2130                    || ((unsigned HOST_WIDE_INT) max_niter
2131                        > (th / const_vf) * const_vf))))
2132     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2133
2134   /* If an epilogue loop is required make sure we can create one.  */
2135   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2136       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2137     {
2138       if (dump_enabled_p ())
2139         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2140       if (!vect_can_advance_ivs_p (loop_vinfo)
2141           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2142                                            single_exit (LOOP_VINFO_LOOP
2143                                                          (loop_vinfo))))
2144         {
2145           if (dump_enabled_p ())
2146             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147                              "not vectorized: can't create required "
2148                              "epilog loop\n");
2149           goto again;
2150         }
2151     }
2152
2153   /* During peeling, we need to check if number of loop iterations is
2154      enough for both peeled prolog loop and vector loop.  This check
2155      can be merged along with threshold check of loop versioning, so
2156      increase threshold for this case if necessary.  */
2157   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2158     {
2159       poly_uint64 niters_th = 0;
2160
2161       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2162         {
2163           /* Niters for peeled prolog loop.  */
2164           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2165             {
2166               struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2167               tree vectype
2168                 = STMT_VINFO_VECTYPE (vinfo_for_stmt (vect_dr_stmt (dr)));
2169               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2170             }
2171           else
2172             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2173         }
2174
2175       /* Niters for at least one iteration of vectorized loop.  */
2176       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2177         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2178       /* One additional iteration because of peeling for gap.  */
2179       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2180         niters_th += 1;
2181       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2182     }
2183
2184   gcc_assert (known_eq (vectorization_factor,
2185                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2186
2187   /* Ok to vectorize!  */
2188   return true;
2189
2190 again:
2191   /* Try again with SLP forced off but if we didn't do any SLP there is
2192      no point in re-trying.  */
2193   if (!slp)
2194     return false;
2195
2196   /* If there are reduction chains re-trying will fail anyway.  */
2197   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2198     return false;
2199
2200   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2201      via interleaving or lane instructions.  */
2202   slp_instance instance;
2203   slp_tree node;
2204   unsigned i, j;
2205   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2206     {
2207       stmt_vec_info vinfo;
2208       vinfo = vinfo_for_stmt
2209           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2210       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2211         continue;
2212       vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2213       unsigned int size = DR_GROUP_SIZE (vinfo);
2214       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2215       if (! vect_store_lanes_supported (vectype, size, false)
2216          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2217          && ! vect_grouped_store_supported (vectype, size))
2218        return false;
2219       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2220         {
2221           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2222           vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2223           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2224           size = DR_GROUP_SIZE (vinfo);
2225           vectype = STMT_VINFO_VECTYPE (vinfo);
2226           if (! vect_load_lanes_supported (vectype, size, false)
2227               && ! vect_grouped_load_supported (vectype, single_element_p,
2228                                                 size))
2229             return false;
2230         }
2231     }
2232
2233   if (dump_enabled_p ())
2234     dump_printf_loc (MSG_NOTE, vect_location,
2235                      "re-trying with SLP disabled\n");
2236
2237   /* Roll back state appropriately.  No SLP this time.  */
2238   slp = false;
2239   /* Restore vectorization factor as it were without SLP.  */
2240   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2241   /* Free the SLP instances.  */
2242   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2243     vect_free_slp_instance (instance);
2244   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2245   /* Reset SLP type to loop_vect on all stmts.  */
2246   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2247     {
2248       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2249       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2250            !gsi_end_p (si); gsi_next (&si))
2251         {
2252           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2253           STMT_SLP_TYPE (stmt_info) = loop_vect;
2254         }
2255       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2256            !gsi_end_p (si); gsi_next (&si))
2257         {
2258           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2259           STMT_SLP_TYPE (stmt_info) = loop_vect;
2260           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2261             {
2262               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2263               STMT_SLP_TYPE (stmt_info) = loop_vect;
2264               for (gimple_stmt_iterator pi
2265                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2266                    !gsi_end_p (pi); gsi_next (&pi))
2267                 {
2268                   gimple *pstmt = gsi_stmt (pi);
2269                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2270                 }
2271             }
2272         }
2273     }
2274   /* Free optimized alias test DDRS.  */
2275   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2276   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2277   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2278   /* Reset target cost data.  */
2279   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2280   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2281     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2282   /* Reset accumulated rgroup information.  */
2283   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2284   /* Reset assorted flags.  */
2285   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2286   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2287   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2288   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2289   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2290
2291   goto start_over;
2292 }
2293
2294 /* Function vect_analyze_loop.
2295
2296    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2297    for it.  The different analyses will record information in the
2298    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2299    be vectorized.  */
2300 loop_vec_info
2301 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2302 {
2303   loop_vec_info loop_vinfo;
2304   auto_vector_sizes vector_sizes;
2305
2306   /* Autodetect first vector size we try.  */
2307   current_vector_size = 0;
2308   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2309   unsigned int next_size = 0;
2310
2311   if (dump_enabled_p ())
2312     dump_printf_loc (MSG_NOTE, vect_location,
2313                      "===== analyze_loop_nest =====\n");
2314
2315   if (loop_outer (loop)
2316       && loop_vec_info_for_loop (loop_outer (loop))
2317       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2318     {
2319       if (dump_enabled_p ())
2320         dump_printf_loc (MSG_NOTE, vect_location,
2321                          "outer-loop already vectorized.\n");
2322       return NULL;
2323     }
2324
2325   poly_uint64 autodetected_vector_size = 0;
2326   while (1)
2327     {
2328       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2329       loop_vinfo = vect_analyze_loop_form (loop);
2330       if (!loop_vinfo)
2331         {
2332           if (dump_enabled_p ())
2333             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334                              "bad loop form.\n");
2335           return NULL;
2336         }
2337
2338       bool fatal = false;
2339
2340       if (orig_loop_vinfo)
2341         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2342
2343       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2344         {
2345           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2346
2347           return loop_vinfo;
2348         }
2349
2350       delete loop_vinfo;
2351
2352       if (next_size == 0)
2353         autodetected_vector_size = current_vector_size;
2354
2355       if (next_size < vector_sizes.length ()
2356           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2357         next_size += 1;
2358
2359       if (fatal
2360           || next_size == vector_sizes.length ()
2361           || known_eq (current_vector_size, 0U))
2362         return NULL;
2363
2364       /* Try the next biggest vector size.  */
2365       current_vector_size = vector_sizes[next_size++];
2366       if (dump_enabled_p ())
2367         {
2368           dump_printf_loc (MSG_NOTE, vect_location,
2369                            "***** Re-trying analysis with "
2370                            "vector size ");
2371           dump_dec (MSG_NOTE, current_vector_size);
2372           dump_printf (MSG_NOTE, "\n");
2373         }
2374     }
2375 }
2376
2377 /* Return true if there is an in-order reduction function for CODE, storing
2378    it in *REDUC_FN if so.  */
2379
2380 static bool
2381 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2382 {
2383   switch (code)
2384     {
2385     case PLUS_EXPR:
2386       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2387       return true;
2388
2389     default:
2390       return false;
2391     }
2392 }
2393
2394 /* Function reduction_fn_for_scalar_code
2395
2396    Input:
2397    CODE - tree_code of a reduction operations.
2398
2399    Output:
2400    REDUC_FN - the corresponding internal function to be used to reduce the
2401       vector of partial results into a single scalar result, or IFN_LAST
2402       if the operation is a supported reduction operation, but does not have
2403       such an internal function.
2404
2405    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2406
2407 static bool
2408 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2409 {
2410   switch (code)
2411     {
2412       case MAX_EXPR:
2413         *reduc_fn = IFN_REDUC_MAX;
2414         return true;
2415
2416       case MIN_EXPR:
2417         *reduc_fn = IFN_REDUC_MIN;
2418         return true;
2419
2420       case PLUS_EXPR:
2421         *reduc_fn = IFN_REDUC_PLUS;
2422         return true;
2423
2424       case BIT_AND_EXPR:
2425         *reduc_fn = IFN_REDUC_AND;
2426         return true;
2427
2428       case BIT_IOR_EXPR:
2429         *reduc_fn = IFN_REDUC_IOR;
2430         return true;
2431
2432       case BIT_XOR_EXPR:
2433         *reduc_fn = IFN_REDUC_XOR;
2434         return true;
2435
2436       case MULT_EXPR:
2437       case MINUS_EXPR:
2438         *reduc_fn = IFN_LAST;
2439         return true;
2440
2441       default:
2442        return false;
2443     }
2444 }
2445
2446 /* If there is a neutral value X such that SLP reduction NODE would not
2447    be affected by the introduction of additional X elements, return that X,
2448    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2449    is true if the SLP statements perform a single reduction, false if each
2450    statement performs an independent reduction.  */
2451
2452 static tree
2453 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2454                               bool reduc_chain)
2455 {
2456   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2457   gimple *stmt = stmts[0];
2458   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2459   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2460   tree scalar_type = TREE_TYPE (vector_type);
2461   struct loop *loop = gimple_bb (stmt)->loop_father;
2462   gcc_assert (loop);
2463
2464   switch (code)
2465     {
2466     case WIDEN_SUM_EXPR:
2467     case DOT_PROD_EXPR:
2468     case SAD_EXPR:
2469     case PLUS_EXPR:
2470     case MINUS_EXPR:
2471     case BIT_IOR_EXPR:
2472     case BIT_XOR_EXPR:
2473       return build_zero_cst (scalar_type);
2474
2475     case MULT_EXPR:
2476       return build_one_cst (scalar_type);
2477
2478     case BIT_AND_EXPR:
2479       return build_all_ones_cst (scalar_type);
2480
2481     case MAX_EXPR:
2482     case MIN_EXPR:
2483       /* For MIN/MAX the initial values are neutral.  A reduction chain
2484          has only a single initial value, so that value is neutral for
2485          all statements.  */
2486       if (reduc_chain)
2487         return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2488       return NULL_TREE;
2489
2490     default:
2491       return NULL_TREE;
2492     }
2493 }
2494
2495 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2496    STMT is printed with a message MSG. */
2497
2498 static void
2499 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2500 {
2501   dump_printf_loc (msg_type, vect_location, "%s", msg);
2502   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2503 }
2504
2505
2506 /* Detect SLP reduction of the form:
2507
2508    #a1 = phi <a5, a0>
2509    a2 = operation (a1)
2510    a3 = operation (a2)
2511    a4 = operation (a3)
2512    a5 = operation (a4)
2513
2514    #a = phi <a5>
2515
2516    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2517    FIRST_STMT is the first reduction stmt in the chain
2518    (a2 = operation (a1)).
2519
2520    Return TRUE if a reduction chain was detected.  */
2521
2522 static bool
2523 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2524                        gimple *first_stmt)
2525 {
2526   struct loop *loop = (gimple_bb (phi))->loop_father;
2527   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2528   enum tree_code code;
2529   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2530   stmt_vec_info use_stmt_info, current_stmt_info;
2531   tree lhs;
2532   imm_use_iterator imm_iter;
2533   use_operand_p use_p;
2534   int nloop_uses, size = 0, n_out_of_loop_uses;
2535   bool found = false;
2536
2537   if (loop != vect_loop)
2538     return false;
2539
2540   lhs = PHI_RESULT (phi);
2541   code = gimple_assign_rhs_code (first_stmt);
2542   while (1)
2543     {
2544       nloop_uses = 0;
2545       n_out_of_loop_uses = 0;
2546       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2547         {
2548           gimple *use_stmt = USE_STMT (use_p);
2549           if (is_gimple_debug (use_stmt))
2550             continue;
2551
2552           /* Check if we got back to the reduction phi.  */
2553           if (use_stmt == phi)
2554             {
2555               loop_use_stmt = use_stmt;
2556               found = true;
2557               break;
2558             }
2559
2560           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2561             {
2562               loop_use_stmt = use_stmt;
2563               nloop_uses++;
2564             }
2565            else
2566              n_out_of_loop_uses++;
2567
2568            /* There are can be either a single use in the loop or two uses in
2569               phi nodes.  */
2570            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2571              return false;
2572         }
2573
2574       if (found)
2575         break;
2576
2577       /* We reached a statement with no loop uses.  */
2578       if (nloop_uses == 0)
2579         return false;
2580
2581       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2582       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2583         return false;
2584
2585       if (!is_gimple_assign (loop_use_stmt)
2586           || code != gimple_assign_rhs_code (loop_use_stmt)
2587           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2588         return false;
2589
2590       /* Insert USE_STMT into reduction chain.  */
2591       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2592       if (current_stmt)
2593         {
2594           current_stmt_info = vinfo_for_stmt (current_stmt);
2595           REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2596           REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2597             = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2598         }
2599       else
2600         REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2601
2602       lhs = gimple_assign_lhs (loop_use_stmt);
2603       current_stmt = loop_use_stmt;
2604       size++;
2605    }
2606
2607   if (!found || loop_use_stmt != phi || size < 2)
2608     return false;
2609
2610   /* Swap the operands, if needed, to make the reduction operand be the second
2611      operand.  */
2612   lhs = PHI_RESULT (phi);
2613   next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2614   while (next_stmt)
2615     {
2616       if (gimple_assign_rhs2 (next_stmt) == lhs)
2617         {
2618           tree op = gimple_assign_rhs1 (next_stmt);
2619           gimple *def_stmt = NULL;
2620
2621           if (TREE_CODE (op) == SSA_NAME)
2622             def_stmt = SSA_NAME_DEF_STMT (op);
2623
2624           /* Check that the other def is either defined in the loop
2625              ("vect_internal_def"), or it's an induction (defined by a
2626              loop-header phi-node).  */
2627           if (def_stmt
2628               && gimple_bb (def_stmt)
2629               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2630               && (is_gimple_assign (def_stmt)
2631                   || is_gimple_call (def_stmt)
2632                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2633                            == vect_induction_def
2634                   || (gimple_code (def_stmt) == GIMPLE_PHI
2635                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2636                                   == vect_internal_def
2637                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2638             {
2639               lhs = gimple_assign_lhs (next_stmt);
2640               next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2641               continue;
2642             }
2643
2644           return false;
2645         }
2646       else
2647         {
2648           tree op = gimple_assign_rhs2 (next_stmt);
2649           gimple *def_stmt = NULL;
2650
2651           if (TREE_CODE (op) == SSA_NAME)
2652             def_stmt = SSA_NAME_DEF_STMT (op);
2653
2654           /* Check that the other def is either defined in the loop
2655             ("vect_internal_def"), or it's an induction (defined by a
2656             loop-header phi-node).  */
2657           if (def_stmt
2658               && gimple_bb (def_stmt)
2659               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2660               && (is_gimple_assign (def_stmt)
2661                   || is_gimple_call (def_stmt)
2662                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2663                               == vect_induction_def
2664                   || (gimple_code (def_stmt) == GIMPLE_PHI
2665                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2666                                   == vect_internal_def
2667                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2668             {
2669               if (dump_enabled_p ())
2670                 {
2671                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2672                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2673                 }
2674
2675               swap_ssa_operands (next_stmt,
2676                                  gimple_assign_rhs1_ptr (next_stmt),
2677                                  gimple_assign_rhs2_ptr (next_stmt));
2678               update_stmt (next_stmt);
2679
2680               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2681                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2682             }
2683           else
2684             return false;
2685         }
2686
2687       lhs = gimple_assign_lhs (next_stmt);
2688       next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2689     }
2690
2691   /* Save the chain for further analysis in SLP detection.  */
2692   first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2693   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2694   REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2695
2696   return true;
2697 }
2698
2699 /* Return true if we need an in-order reduction for operation CODE
2700    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2701    overflow must wrap.  */
2702
2703 static bool
2704 needs_fold_left_reduction_p (tree type, tree_code code,
2705                              bool need_wrapping_integral_overflow)
2706 {
2707   /* CHECKME: check for !flag_finite_math_only too?  */
2708   if (SCALAR_FLOAT_TYPE_P (type))
2709     switch (code)
2710       {
2711       case MIN_EXPR:
2712       case MAX_EXPR:
2713         return false;
2714
2715       default:
2716         return !flag_associative_math;
2717       }
2718
2719   if (INTEGRAL_TYPE_P (type))
2720     {
2721       if (!operation_no_trapping_overflow (type, code))
2722         return true;
2723       if (need_wrapping_integral_overflow
2724           && !TYPE_OVERFLOW_WRAPS (type)
2725           && operation_can_overflow (code))
2726         return true;
2727       return false;
2728     }
2729
2730   if (SAT_FIXED_POINT_TYPE_P (type))
2731     return true;
2732
2733   return false;
2734 }
2735
2736 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2737    reduction operation CODE has a handled computation expression.  */
2738
2739 bool
2740 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2741                       enum tree_code code)
2742 {
2743   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2744   auto_bitmap visited;
2745   tree lookfor = PHI_RESULT (phi);
2746   ssa_op_iter curri;
2747   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2748   while (USE_FROM_PTR (curr) != loop_arg)
2749     curr = op_iter_next_use (&curri);
2750   curri.i = curri.numops;
2751   do
2752     {
2753       path.safe_push (std::make_pair (curri, curr));
2754       tree use = USE_FROM_PTR (curr);
2755       if (use == lookfor)
2756         break;
2757       gimple *def = SSA_NAME_DEF_STMT (use);
2758       if (gimple_nop_p (def)
2759           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2760         {
2761 pop:
2762           do
2763             {
2764               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2765               curri = x.first;
2766               curr = x.second;
2767               do
2768                 curr = op_iter_next_use (&curri);
2769               /* Skip already visited or non-SSA operands (from iterating
2770                  over PHI args).  */
2771               while (curr != NULL_USE_OPERAND_P
2772                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2773                          || ! bitmap_set_bit (visited,
2774                                               SSA_NAME_VERSION
2775                                                 (USE_FROM_PTR (curr)))));
2776             }
2777           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2778           if (curr == NULL_USE_OPERAND_P)
2779             break;
2780         }
2781       else
2782         {
2783           if (gimple_code (def) == GIMPLE_PHI)
2784             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2785           else
2786             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2787           while (curr != NULL_USE_OPERAND_P
2788                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2789                      || ! bitmap_set_bit (visited,
2790                                           SSA_NAME_VERSION
2791                                             (USE_FROM_PTR (curr)))))
2792             curr = op_iter_next_use (&curri);
2793           if (curr == NULL_USE_OPERAND_P)
2794             goto pop;
2795         }
2796     }
2797   while (1);
2798   if (dump_file && (dump_flags & TDF_DETAILS))
2799     {
2800       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2801       unsigned i;
2802       std::pair<ssa_op_iter, use_operand_p> *x;
2803       FOR_EACH_VEC_ELT (path, i, x)
2804         {
2805           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2806           dump_printf (MSG_NOTE, " ");
2807         }
2808       dump_printf (MSG_NOTE, "\n");
2809     }
2810
2811   /* Check whether the reduction path detected is valid.  */
2812   bool fail = path.length () == 0;
2813   bool neg = false;
2814   for (unsigned i = 1; i < path.length (); ++i)
2815     {
2816       gimple *use_stmt = USE_STMT (path[i].second);
2817       tree op = USE_FROM_PTR (path[i].second);
2818       if (! has_single_use (op)
2819           || ! is_gimple_assign (use_stmt))
2820         {
2821           fail = true;
2822           break;
2823         }
2824       if (gimple_assign_rhs_code (use_stmt) != code)
2825         {
2826           if (code == PLUS_EXPR
2827               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2828             {
2829               /* Track whether we negate the reduction value each iteration.  */
2830               if (gimple_assign_rhs2 (use_stmt) == op)
2831                 neg = ! neg;
2832             }
2833           else
2834             {
2835               fail = true;
2836               break;
2837             }
2838         }
2839     }
2840   return ! fail && ! neg;
2841 }
2842
2843
2844 /* Function vect_is_simple_reduction
2845
2846    (1) Detect a cross-iteration def-use cycle that represents a simple
2847    reduction computation.  We look for the following pattern:
2848
2849    loop_header:
2850      a1 = phi < a0, a2 >
2851      a3 = ...
2852      a2 = operation (a3, a1)
2853
2854    or
2855
2856    a3 = ...
2857    loop_header:
2858      a1 = phi < a0, a2 >
2859      a2 = operation (a3, a1)
2860
2861    such that:
2862    1. operation is commutative and associative and it is safe to
2863       change the order of the computation
2864    2. no uses for a2 in the loop (a2 is used out of the loop)
2865    3. no uses of a1 in the loop besides the reduction operation
2866    4. no uses of a1 outside the loop.
2867
2868    Conditions 1,4 are tested here.
2869    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2870
2871    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2872    nested cycles.
2873
2874    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2875    reductions:
2876
2877      a1 = phi < a0, a2 >
2878      inner loop (def of a3)
2879      a2 = phi < a3 >
2880
2881    (4) Detect condition expressions, ie:
2882      for (int i = 0; i < N; i++)
2883        if (a[i] < val)
2884         ret_val = a[i];
2885
2886 */
2887
2888 static gimple *
2889 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2890                           bool *double_reduc,
2891                           bool need_wrapping_integral_overflow,
2892                           enum vect_reduction_type *v_reduc_type)
2893 {
2894   struct loop *loop = (gimple_bb (phi))->loop_father;
2895   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2896   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2897   enum tree_code orig_code, code;
2898   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2899   tree type;
2900   int nloop_uses;
2901   tree name;
2902   imm_use_iterator imm_iter;
2903   use_operand_p use_p;
2904   bool phi_def;
2905
2906   *double_reduc = false;
2907   *v_reduc_type = TREE_CODE_REDUCTION;
2908
2909   tree phi_name = PHI_RESULT (phi);
2910   /* ???  If there are no uses of the PHI result the inner loop reduction
2911      won't be detected as possibly double-reduction by vectorizable_reduction
2912      because that tries to walk the PHI arg from the preheader edge which
2913      can be constant.  See PR60382.  */
2914   if (has_zero_uses (phi_name))
2915     return NULL;
2916   nloop_uses = 0;
2917   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2918     {
2919       gimple *use_stmt = USE_STMT (use_p);
2920       if (is_gimple_debug (use_stmt))
2921         continue;
2922
2923       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2924         {
2925           if (dump_enabled_p ())
2926             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2927                              "intermediate value used outside loop.\n");
2928
2929           return NULL;
2930         }
2931
2932       nloop_uses++;
2933       if (nloop_uses > 1)
2934         {
2935           if (dump_enabled_p ())
2936             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2937                              "reduction value used in loop.\n");
2938           return NULL;
2939         }
2940
2941       phi_use_stmt = use_stmt;
2942     }
2943
2944   edge latch_e = loop_latch_edge (loop);
2945   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2946   if (TREE_CODE (loop_arg) != SSA_NAME)
2947     {
2948       if (dump_enabled_p ())
2949         {
2950           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2951                            "reduction: not ssa_name: ");
2952           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2953           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2954         }
2955       return NULL;
2956     }
2957
2958   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2959   if (is_gimple_assign (def_stmt))
2960     {
2961       name = gimple_assign_lhs (def_stmt);
2962       phi_def = false;
2963     }
2964   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2965     {
2966       name = PHI_RESULT (def_stmt);
2967       phi_def = true;
2968     }
2969   else
2970     {
2971       if (dump_enabled_p ())
2972         {
2973           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974                            "reduction: unhandled reduction operation: ");
2975           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2976         }
2977       return NULL;
2978     }
2979
2980   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2981     return NULL;
2982
2983   nloop_uses = 0;
2984   auto_vec<gphi *, 3> lcphis;
2985   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2986     {
2987       gimple *use_stmt = USE_STMT (use_p);
2988       if (is_gimple_debug (use_stmt))
2989         continue;
2990       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2991         nloop_uses++;
2992       else
2993         /* We can have more than one loop-closed PHI.  */
2994         lcphis.safe_push (as_a <gphi *> (use_stmt));
2995       if (nloop_uses > 1)
2996         {
2997           if (dump_enabled_p ())
2998             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2999                              "reduction used in loop.\n");
3000           return NULL;
3001         }
3002     }
3003
3004   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3005      defined in the inner loop.  */
3006   if (phi_def)
3007     {
3008       op1 = PHI_ARG_DEF (def_stmt, 0);
3009
3010       if (gimple_phi_num_args (def_stmt) != 1
3011           || TREE_CODE (op1) != SSA_NAME)
3012         {
3013           if (dump_enabled_p ())
3014             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3015                              "unsupported phi node definition.\n");
3016
3017           return NULL;
3018         }
3019
3020       def1 = SSA_NAME_DEF_STMT (op1);
3021       if (gimple_bb (def1)
3022           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3023           && loop->inner
3024           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3025           && is_gimple_assign (def1)
3026           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3027         {
3028           if (dump_enabled_p ())
3029             report_vect_op (MSG_NOTE, def_stmt,
3030                             "detected double reduction: ");
3031
3032           *double_reduc = true;
3033           return def_stmt;
3034         }
3035
3036       return NULL;
3037     }
3038
3039   /* If we are vectorizing an inner reduction we are executing that
3040      in the original order only in case we are not dealing with a
3041      double reduction.  */
3042   bool check_reduction = true;
3043   if (flow_loop_nested_p (vect_loop, loop))
3044     {
3045       gphi *lcphi;
3046       unsigned i;
3047       check_reduction = false;
3048       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3049         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3050           {
3051             gimple *use_stmt = USE_STMT (use_p);
3052             if (is_gimple_debug (use_stmt))
3053               continue;
3054             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3055               check_reduction = true;
3056           }
3057     }
3058
3059   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3060   code = orig_code = gimple_assign_rhs_code (def_stmt);
3061
3062   /* We can handle "res -= x[i]", which is non-associative by
3063      simply rewriting this into "res += -x[i]".  Avoid changing
3064      gimple instruction for the first simple tests and only do this
3065      if we're allowed to change code at all.  */
3066   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3067     code = PLUS_EXPR;
3068
3069   if (code == COND_EXPR)
3070     {
3071       if (! nested_in_vect_loop)
3072         *v_reduc_type = COND_REDUCTION;
3073
3074       op3 = gimple_assign_rhs1 (def_stmt);
3075       if (COMPARISON_CLASS_P (op3))
3076         {
3077           op4 = TREE_OPERAND (op3, 1);
3078           op3 = TREE_OPERAND (op3, 0);
3079         }
3080       if (op3 == phi_name || op4 == phi_name)
3081         {
3082           if (dump_enabled_p ())
3083             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3084                             "reduction: condition depends on previous"
3085                             " iteration: ");
3086           return NULL;
3087         }
3088
3089       op1 = gimple_assign_rhs2 (def_stmt);
3090       op2 = gimple_assign_rhs3 (def_stmt);
3091     }
3092   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3093     {
3094       if (dump_enabled_p ())
3095         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3096                         "reduction: not commutative/associative: ");
3097       return NULL;
3098     }
3099   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3100     {
3101       op1 = gimple_assign_rhs1 (def_stmt);
3102       op2 = gimple_assign_rhs2 (def_stmt);
3103     }
3104   else
3105     {
3106       if (dump_enabled_p ())
3107         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3108                         "reduction: not handled operation: ");
3109       return NULL;
3110     }
3111
3112   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3113     {
3114       if (dump_enabled_p ())
3115         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3116                         "reduction: both uses not ssa_names: ");
3117
3118       return NULL;
3119     }
3120
3121   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3122   if ((TREE_CODE (op1) == SSA_NAME
3123        && !types_compatible_p (type,TREE_TYPE (op1)))
3124       || (TREE_CODE (op2) == SSA_NAME
3125           && !types_compatible_p (type, TREE_TYPE (op2)))
3126       || (op3 && TREE_CODE (op3) == SSA_NAME
3127           && !types_compatible_p (type, TREE_TYPE (op3)))
3128       || (op4 && TREE_CODE (op4) == SSA_NAME
3129           && !types_compatible_p (type, TREE_TYPE (op4))))
3130     {
3131       if (dump_enabled_p ())
3132         {
3133           dump_printf_loc (MSG_NOTE, vect_location,
3134                            "reduction: multiple types: operation type: ");
3135           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3136           dump_printf (MSG_NOTE, ", operands types: ");
3137           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3138                              TREE_TYPE (op1));
3139           dump_printf (MSG_NOTE, ",");
3140           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3141                              TREE_TYPE (op2));
3142           if (op3)
3143             {
3144               dump_printf (MSG_NOTE, ",");
3145               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3146                                  TREE_TYPE (op3));
3147             }
3148
3149           if (op4)
3150             {
3151               dump_printf (MSG_NOTE, ",");
3152               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3153                                  TREE_TYPE (op4));
3154             }
3155           dump_printf (MSG_NOTE, "\n");
3156         }
3157
3158       return NULL;
3159     }
3160
3161   /* Check whether it's ok to change the order of the computation.
3162      Generally, when vectorizing a reduction we change the order of the
3163      computation.  This may change the behavior of the program in some
3164      cases, so we need to check that this is ok.  One exception is when
3165      vectorizing an outer-loop: the inner-loop is executed sequentially,
3166      and therefore vectorizing reductions in the inner-loop during
3167      outer-loop vectorization is safe.  */
3168   if (check_reduction
3169       && *v_reduc_type == TREE_CODE_REDUCTION
3170       && needs_fold_left_reduction_p (type, code,
3171                                       need_wrapping_integral_overflow))
3172     *v_reduc_type = FOLD_LEFT_REDUCTION;
3173
3174   /* Reduction is safe. We're dealing with one of the following:
3175      1) integer arithmetic and no trapv
3176      2) floating point arithmetic, and special flags permit this optimization
3177      3) nested cycle (i.e., outer loop vectorization).  */
3178   if (TREE_CODE (op1) == SSA_NAME)
3179     def1 = SSA_NAME_DEF_STMT (op1);
3180
3181   if (TREE_CODE (op2) == SSA_NAME)
3182     def2 = SSA_NAME_DEF_STMT (op2);
3183
3184   if (code != COND_EXPR
3185       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3186     {
3187       if (dump_enabled_p ())
3188         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3189       return NULL;
3190     }
3191
3192   /* Check that one def is the reduction def, defined by PHI,
3193      the other def is either defined in the loop ("vect_internal_def"),
3194      or it's an induction (defined by a loop-header phi-node).  */
3195
3196   if (def2 && def2 == phi
3197       && (code == COND_EXPR
3198           || !def1 || gimple_nop_p (def1)
3199           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3200           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3201               && (is_gimple_assign (def1)
3202                   || is_gimple_call (def1)
3203                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3204                       == vect_induction_def
3205                   || (gimple_code (def1) == GIMPLE_PHI
3206                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3207                           == vect_internal_def
3208                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3209     {
3210       if (dump_enabled_p ())
3211         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3212       return def_stmt;
3213     }
3214
3215   if (def1 && def1 == phi
3216       && (code == COND_EXPR
3217           || !def2 || gimple_nop_p (def2)
3218           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3219           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3220               && (is_gimple_assign (def2)
3221                   || is_gimple_call (def2)
3222                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3223                        == vect_induction_def
3224                   || (gimple_code (def2) == GIMPLE_PHI
3225                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3226                            == vect_internal_def
3227                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3228     {
3229       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3230         {
3231           /* Check if we can swap operands (just for simplicity - so that
3232              the rest of the code can assume that the reduction variable
3233              is always the last (second) argument).  */
3234           if (code == COND_EXPR)
3235             {
3236               /* Swap cond_expr by inverting the condition.  */
3237               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3238               enum tree_code invert_code = ERROR_MARK;
3239               enum tree_code cond_code = TREE_CODE (cond_expr);
3240
3241               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3242                 {
3243                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3244                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3245                 }
3246               if (invert_code != ERROR_MARK)
3247                 {
3248                   TREE_SET_CODE (cond_expr, invert_code);
3249                   swap_ssa_operands (def_stmt,
3250                                      gimple_assign_rhs2_ptr (def_stmt),
3251                                      gimple_assign_rhs3_ptr (def_stmt));
3252                 }
3253               else
3254                 {
3255                   if (dump_enabled_p ())
3256                     report_vect_op (MSG_NOTE, def_stmt,
3257                                     "detected reduction: cannot swap operands "
3258                                     "for cond_expr");
3259                   return NULL;
3260                 }
3261             }
3262           else
3263             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3264                                gimple_assign_rhs2_ptr (def_stmt));
3265
3266           if (dump_enabled_p ())
3267             report_vect_op (MSG_NOTE, def_stmt,
3268                             "detected reduction: need to swap operands: ");
3269
3270           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3271             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3272         }
3273       else
3274         {
3275           if (dump_enabled_p ())
3276             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3277         }
3278
3279       return def_stmt;
3280     }
3281
3282   /* Try to find SLP reduction chain.  */
3283   if (! nested_in_vect_loop
3284       && code != COND_EXPR
3285       && orig_code != MINUS_EXPR
3286       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3287     {
3288       if (dump_enabled_p ())
3289         report_vect_op (MSG_NOTE, def_stmt,
3290                         "reduction: detected reduction chain: ");
3291
3292       return def_stmt;
3293     }
3294
3295   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3296   gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3297   while (first)
3298     {
3299       gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3300       REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3301       REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3302       first = next;
3303     }
3304
3305   /* Look for the expression computing loop_arg from loop PHI result.  */
3306   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3307                             code))
3308     return def_stmt;
3309
3310   if (dump_enabled_p ())
3311     {
3312       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3313                       "reduction: unknown pattern: ");
3314     }
3315
3316   return NULL;
3317 }
3318
3319 /* Wrapper around vect_is_simple_reduction, which will modify code
3320    in-place if it enables detection of more reductions.  Arguments
3321    as there.  */
3322
3323 gimple *
3324 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3325                              bool *double_reduc,
3326                              bool need_wrapping_integral_overflow)
3327 {
3328   enum vect_reduction_type v_reduc_type;
3329   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3330                                           need_wrapping_integral_overflow,
3331                                           &v_reduc_type);
3332   if (def)
3333     {
3334       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3335       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3336       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3337       reduc_def_info = vinfo_for_stmt (def);
3338       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3339       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3340     }
3341   return def;
3342 }
3343
3344 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3345 int
3346 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3347                              int *peel_iters_epilogue,
3348                              stmt_vector_for_cost *scalar_cost_vec,
3349                              stmt_vector_for_cost *prologue_cost_vec,
3350                              stmt_vector_for_cost *epilogue_cost_vec)
3351 {
3352   int retval = 0;
3353   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3354
3355   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3356     {
3357       *peel_iters_epilogue = assumed_vf / 2;
3358       if (dump_enabled_p ())
3359         dump_printf_loc (MSG_NOTE, vect_location,
3360                          "cost model: epilogue peel iters set to vf/2 "
3361                          "because loop iterations are unknown .\n");
3362
3363       /* If peeled iterations are known but number of scalar loop
3364          iterations are unknown, count a taken branch per peeled loop.  */
3365       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3366                                  NULL, 0, vect_prologue);
3367       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3368                                  NULL, 0, vect_epilogue);
3369     }
3370   else
3371     {
3372       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3373       peel_iters_prologue = niters < peel_iters_prologue ?
3374                             niters : peel_iters_prologue;
3375       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3376       /* If we need to peel for gaps, but no peeling is required, we have to
3377          peel VF iterations.  */
3378       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3379         *peel_iters_epilogue = assumed_vf;
3380     }
3381
3382   stmt_info_for_cost *si;
3383   int j;
3384   if (peel_iters_prologue)
3385     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3386         {
3387           stmt_vec_info stmt_info
3388             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3389           retval += record_stmt_cost (prologue_cost_vec,
3390                                       si->count * peel_iters_prologue,
3391                                       si->kind, stmt_info, si->misalign,
3392                                       vect_prologue);
3393         }
3394   if (*peel_iters_epilogue)
3395     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3396         {
3397           stmt_vec_info stmt_info
3398             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3399           retval += record_stmt_cost (epilogue_cost_vec,
3400                                       si->count * *peel_iters_epilogue,
3401                                       si->kind, stmt_info, si->misalign,
3402                                       vect_epilogue);
3403         }
3404
3405   return retval;
3406 }
3407
3408 /* Function vect_estimate_min_profitable_iters
3409
3410    Return the number of iterations required for the vector version of the
3411    loop to be profitable relative to the cost of the scalar version of the
3412    loop.
3413
3414    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3415    of iterations for vectorization.  -1 value means loop vectorization
3416    is not profitable.  This returned value may be used for dynamic
3417    profitability check.
3418
3419    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3420    for static check against estimated number of iterations.  */
3421
3422 static void
3423 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3424                                     int *ret_min_profitable_niters,
3425                                     int *ret_min_profitable_estimate)
3426 {
3427   int min_profitable_iters;
3428   int min_profitable_estimate;
3429   int peel_iters_prologue;
3430   int peel_iters_epilogue;
3431   unsigned vec_inside_cost = 0;
3432   int vec_outside_cost = 0;
3433   unsigned vec_prologue_cost = 0;
3434   unsigned vec_epilogue_cost = 0;
3435   int scalar_single_iter_cost = 0;
3436   int scalar_outside_cost = 0;
3437   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3438   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3439   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3440
3441   /* Cost model disabled.  */
3442   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3443     {
3444       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3445       *ret_min_profitable_niters = 0;
3446       *ret_min_profitable_estimate = 0;
3447       return;
3448     }
3449
3450   /* Requires loop versioning tests to handle misalignment.  */
3451   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3452     {
3453       /*  FIXME: Make cost depend on complexity of individual check.  */
3454       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3455       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3456                             vect_prologue);
3457       dump_printf (MSG_NOTE,
3458                    "cost model: Adding cost of checks for loop "
3459                    "versioning to treat misalignment.\n");
3460     }
3461
3462   /* Requires loop versioning with alias checks.  */
3463   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3464     {
3465       /*  FIXME: Make cost depend on complexity of individual check.  */
3466       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3467       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3468                             vect_prologue);
3469       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3470       if (len)
3471         /* Count LEN - 1 ANDs and LEN comparisons.  */
3472         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3473                               NULL, 0, vect_prologue);
3474       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3475       if (len)
3476         {
3477           /* Count LEN - 1 ANDs and LEN comparisons.  */
3478           unsigned int nstmts = len * 2 - 1;
3479           /* +1 for each bias that needs adding.  */
3480           for (unsigned int i = 0; i < len; ++i)
3481             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3482               nstmts += 1;
3483           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3484                                 NULL, 0, vect_prologue);
3485         }
3486       dump_printf (MSG_NOTE,
3487                    "cost model: Adding cost of checks for loop "
3488                    "versioning aliasing.\n");
3489     }
3490
3491   /* Requires loop versioning with niter checks.  */
3492   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3493     {
3494       /*  FIXME: Make cost depend on complexity of individual check.  */
3495       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3496                             vect_prologue);
3497       dump_printf (MSG_NOTE,
3498                    "cost model: Adding cost of checks for loop "
3499                    "versioning niters.\n");
3500     }
3501
3502   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3503     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3504                           vect_prologue);
3505
3506   /* Count statements in scalar loop.  Using this as scalar cost for a single
3507      iteration for now.
3508
3509      TODO: Add outer loop support.
3510
3511      TODO: Consider assigning different costs to different scalar
3512      statements.  */
3513
3514   scalar_single_iter_cost
3515     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3516
3517   /* Add additional cost for the peeled instructions in prologue and epilogue
3518      loop.  (For fully-masked loops there will be no peeling.)
3519
3520      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3521      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3522
3523      TODO: Build an expression that represents peel_iters for prologue and
3524      epilogue to be used in a run-time test.  */
3525
3526   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3527     {
3528       peel_iters_prologue = 0;
3529       peel_iters_epilogue = 0;
3530
3531       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3532         {
3533           /* We need to peel exactly one iteration.  */
3534           peel_iters_epilogue += 1;
3535           stmt_info_for_cost *si;
3536           int j;
3537           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3538                             j, si)
3539             {
3540               struct _stmt_vec_info *stmt_info
3541                 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3542               (void) add_stmt_cost (target_cost_data, si->count,
3543                                     si->kind, stmt_info, si->misalign,
3544                                     vect_epilogue);
3545             }
3546         }
3547     }
3548   else if (npeel < 0)
3549     {
3550       peel_iters_prologue = assumed_vf / 2;
3551       dump_printf (MSG_NOTE, "cost model: "
3552                    "prologue peel iters set to vf/2.\n");
3553
3554       /* If peeling for alignment is unknown, loop bound of main loop becomes
3555          unknown.  */
3556       peel_iters_epilogue = assumed_vf / 2;
3557       dump_printf (MSG_NOTE, "cost model: "
3558                    "epilogue peel iters set to vf/2 because "
3559                    "peeling for alignment is unknown.\n");
3560
3561       /* If peeled iterations are unknown, count a taken branch and a not taken
3562          branch per peeled loop. Even if scalar loop iterations are known,
3563          vector iterations are not known since peeled prologue iterations are
3564          not known. Hence guards remain the same.  */
3565       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3566                             NULL, 0, vect_prologue);
3567       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3568                             NULL, 0, vect_prologue);
3569       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3570                             NULL, 0, vect_epilogue);
3571       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3572                             NULL, 0, vect_epilogue);
3573       stmt_info_for_cost *si;
3574       int j;
3575       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3576         {
3577           struct _stmt_vec_info *stmt_info
3578             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3579           (void) add_stmt_cost (target_cost_data,
3580                                 si->count * peel_iters_prologue,
3581                                 si->kind, stmt_info, si->misalign,
3582                                 vect_prologue);
3583           (void) add_stmt_cost (target_cost_data,
3584                                 si->count * peel_iters_epilogue,
3585                                 si->kind, stmt_info, si->misalign,
3586                                 vect_epilogue);
3587         }
3588     }
3589   else
3590     {
3591       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3592       stmt_info_for_cost *si;
3593       int j;
3594       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3595
3596       prologue_cost_vec.create (2);
3597       epilogue_cost_vec.create (2);
3598       peel_iters_prologue = npeel;
3599
3600       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3601                                           &peel_iters_epilogue,
3602                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3603                                             (loop_vinfo),
3604                                           &prologue_cost_vec,
3605                                           &epilogue_cost_vec);
3606
3607       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3608         {
3609           struct _stmt_vec_info *stmt_info
3610             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3611           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3612                                 si->misalign, vect_prologue);
3613         }
3614
3615       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3616         {
3617           struct _stmt_vec_info *stmt_info
3618             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3619           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3620                                 si->misalign, vect_epilogue);
3621         }
3622
3623       prologue_cost_vec.release ();
3624       epilogue_cost_vec.release ();
3625     }
3626
3627   /* FORNOW: The scalar outside cost is incremented in one of the
3628      following ways:
3629
3630      1. The vectorizer checks for alignment and aliasing and generates
3631      a condition that allows dynamic vectorization.  A cost model
3632      check is ANDED with the versioning condition.  Hence scalar code
3633      path now has the added cost of the versioning check.
3634
3635        if (cost > th & versioning_check)
3636          jmp to vector code
3637
3638      Hence run-time scalar is incremented by not-taken branch cost.
3639
3640      2. The vectorizer then checks if a prologue is required.  If the
3641      cost model check was not done before during versioning, it has to
3642      be done before the prologue check.
3643
3644        if (cost <= th)
3645          prologue = scalar_iters
3646        if (prologue == 0)
3647          jmp to vector code
3648        else
3649          execute prologue
3650        if (prologue == num_iters)
3651          go to exit
3652
3653      Hence the run-time scalar cost is incremented by a taken branch,
3654      plus a not-taken branch, plus a taken branch cost.
3655
3656      3. The vectorizer then checks if an epilogue is required.  If the
3657      cost model check was not done before during prologue check, it
3658      has to be done with the epilogue check.
3659
3660        if (prologue == 0)
3661          jmp to vector code
3662        else
3663          execute prologue
3664        if (prologue == num_iters)
3665          go to exit
3666        vector code:
3667          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3668            jmp to epilogue
3669
3670      Hence the run-time scalar cost should be incremented by 2 taken
3671      branches.
3672
3673      TODO: The back end may reorder the BBS's differently and reverse
3674      conditions/branch directions.  Change the estimates below to
3675      something more reasonable.  */
3676
3677   /* If the number of iterations is known and we do not do versioning, we can
3678      decide whether to vectorize at compile time.  Hence the scalar version
3679      do not carry cost model guard costs.  */
3680   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3681       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3682     {
3683       /* Cost model check occurs at versioning.  */
3684       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3685         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3686       else
3687         {
3688           /* Cost model check occurs at prologue generation.  */
3689           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3690             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3691               + vect_get_stmt_cost (cond_branch_not_taken);
3692           /* Cost model check occurs at epilogue generation.  */
3693           else
3694             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3695         }
3696     }
3697
3698   /* Complete the target-specific cost calculations.  */
3699   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3700                &vec_inside_cost, &vec_epilogue_cost);
3701
3702   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3703
3704   if (dump_enabled_p ())
3705     {
3706       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3707       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3708                    vec_inside_cost);
3709       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3710                    vec_prologue_cost);
3711       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3712                    vec_epilogue_cost);
3713       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3714                    scalar_single_iter_cost);
3715       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3716                    scalar_outside_cost);
3717       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3718                    vec_outside_cost);
3719       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3720                    peel_iters_prologue);
3721       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3722                    peel_iters_epilogue);
3723     }
3724
3725   /* Calculate number of iterations required to make the vector version
3726      profitable, relative to the loop bodies only.  The following condition
3727      must hold true:
3728      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3729      where
3730      SIC = scalar iteration cost, VIC = vector iteration cost,
3731      VOC = vector outside cost, VF = vectorization factor,
3732      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3733      SOC = scalar outside cost for run time cost model check.  */
3734
3735   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3736     {
3737       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3738                               * assumed_vf
3739                               - vec_inside_cost * peel_iters_prologue
3740                               - vec_inside_cost * peel_iters_epilogue);
3741       if (min_profitable_iters <= 0)
3742         min_profitable_iters = 0;
3743       else
3744         {
3745           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3746                                    - vec_inside_cost);
3747
3748           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3749               <= (((int) vec_inside_cost * min_profitable_iters)
3750                   + (((int) vec_outside_cost - scalar_outside_cost)
3751                      * assumed_vf)))
3752             min_profitable_iters++;
3753         }
3754     }
3755   /* vector version will never be profitable.  */
3756   else
3757     {
3758       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3759         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3760                     "did not happen for a simd loop");
3761
3762       if (dump_enabled_p ())
3763         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3764                          "cost model: the vector iteration cost = %d "
3765                          "divided by the scalar iteration cost = %d "
3766                          "is greater or equal to the vectorization factor = %d"
3767                          ".\n",
3768                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3769       *ret_min_profitable_niters = -1;
3770       *ret_min_profitable_estimate = -1;
3771       return;
3772     }
3773
3774   dump_printf (MSG_NOTE,
3775                "  Calculated minimum iters for profitability: %d\n",
3776                min_profitable_iters);
3777
3778   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3779       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3780     /* We want the vectorized loop to execute at least once.  */
3781     min_profitable_iters = assumed_vf + peel_iters_prologue;
3782
3783   if (dump_enabled_p ())
3784     dump_printf_loc (MSG_NOTE, vect_location,
3785                      "  Runtime profitability threshold = %d\n",
3786                      min_profitable_iters);
3787
3788   *ret_min_profitable_niters = min_profitable_iters;
3789
3790   /* Calculate number of iterations required to make the vector version
3791      profitable, relative to the loop bodies only.
3792
3793      Non-vectorized variant is SIC * niters and it must win over vector
3794      variant on the expected loop trip count.  The following condition must hold true:
3795      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3796
3797   if (vec_outside_cost <= 0)
3798     min_profitable_estimate = 0;
3799   else
3800     {
3801       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3802                                  * assumed_vf
3803                                  - vec_inside_cost * peel_iters_prologue
3804                                  - vec_inside_cost * peel_iters_epilogue)
3805                                  / ((scalar_single_iter_cost * assumed_vf)
3806                                    - vec_inside_cost);
3807     }
3808   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3809   if (dump_enabled_p ())
3810     dump_printf_loc (MSG_NOTE, vect_location,
3811                      "  Static estimate profitability threshold = %d\n",
3812                      min_profitable_estimate);
3813
3814   *ret_min_profitable_estimate = min_profitable_estimate;
3815 }
3816
3817 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3818    vector elements (not bits) for a vector with NELT elements.  */
3819 static void
3820 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3821                               vec_perm_builder *sel)
3822 {
3823   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3824      by vec_perm_indices.  */
3825   sel->new_vector (nelt, 1, 3);
3826   for (unsigned int i = 0; i < 3; i++)
3827     sel->quick_push (i + offset);
3828 }
3829
3830 /* Checks whether the target supports whole-vector shifts for vectors of mode
3831    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3832    it supports vec_perm_const with masks for all necessary shift amounts.  */
3833 static bool
3834 have_whole_vector_shift (machine_mode mode)
3835 {
3836   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3837     return true;
3838
3839   /* Variable-length vectors should be handled via the optab.  */
3840   unsigned int nelt;
3841   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3842     return false;
3843
3844   vec_perm_builder sel;
3845   vec_perm_indices indices;
3846   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3847     {
3848       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3849       indices.new_vector (sel, 2, nelt);
3850       if (!can_vec_perm_const_p (mode, indices, false))
3851         return false;
3852     }
3853   return true;
3854 }
3855
3856 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3857    functions. Design better to avoid maintenance issues.  */
3858
3859 /* Function vect_model_reduction_cost.
3860
3861    Models cost for a reduction operation, including the vector ops
3862    generated within the strip-mine loop, the initial definition before
3863    the loop, and the epilogue code that must be generated.  */
3864
3865 static void
3866 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3867                            int ncopies, stmt_vector_for_cost *cost_vec)
3868 {
3869   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3870   enum tree_code code;
3871   optab optab;
3872   tree vectype;
3873   gimple *orig_stmt;
3874   machine_mode mode;
3875   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3876   struct loop *loop = NULL;
3877
3878   if (loop_vinfo)
3879     loop = LOOP_VINFO_LOOP (loop_vinfo);
3880
3881   /* Condition reductions generate two reductions in the loop.  */
3882   vect_reduction_type reduction_type
3883     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3884   if (reduction_type == COND_REDUCTION)
3885     ncopies *= 2;
3886
3887   vectype = STMT_VINFO_VECTYPE (stmt_info);
3888   mode = TYPE_MODE (vectype);
3889   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3890
3891   if (!orig_stmt)
3892     orig_stmt = STMT_VINFO_STMT (stmt_info);
3893
3894   code = gimple_assign_rhs_code (orig_stmt);
3895
3896   if (reduction_type == EXTRACT_LAST_REDUCTION
3897       || reduction_type == FOLD_LEFT_REDUCTION)
3898     {
3899       /* No extra instructions needed in the prologue.  */
3900       prologue_cost = 0;
3901
3902       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3903         /* Count one reduction-like operation per vector.  */
3904         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3905                                         stmt_info, 0, vect_body);
3906       else
3907         {
3908           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
3909           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3910           inside_cost = record_stmt_cost (cost_vec, nelements,
3911                                           vec_to_scalar, stmt_info, 0,
3912                                           vect_body);
3913           inside_cost += record_stmt_cost (cost_vec, nelements,
3914                                            scalar_stmt, stmt_info, 0,
3915                                            vect_body);
3916         }
3917     }
3918   else
3919     {
3920       /* Add in cost for initial definition.
3921          For cond reduction we have four vectors: initial index, step,
3922          initial result of the data reduction, initial value of the index
3923          reduction.  */
3924       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3925       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3926                                          scalar_to_vec, stmt_info, 0,
3927                                          vect_prologue);
3928
3929       /* Cost of reduction op inside loop.  */
3930       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3931                                       stmt_info, 0, vect_body);
3932     }
3933
3934   /* Determine cost of epilogue code.
3935
3936      We have a reduction operator that will reduce the vector in one statement.
3937      Also requires scalar extract.  */
3938
3939   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3940     {
3941       if (reduc_fn != IFN_LAST)
3942         {
3943           if (reduction_type == COND_REDUCTION)
3944             {
3945               /* An EQ stmt and an COND_EXPR stmt.  */
3946               epilogue_cost += record_stmt_cost (cost_vec, 2,
3947                                                  vector_stmt, stmt_info, 0,
3948                                                  vect_epilogue);
3949               /* Reduction of the max index and a reduction of the found
3950                  values.  */
3951               epilogue_cost += record_stmt_cost (cost_vec, 2,
3952                                                  vec_to_scalar, stmt_info, 0,
3953                                                  vect_epilogue);
3954               /* A broadcast of the max value.  */
3955               epilogue_cost += record_stmt_cost (cost_vec, 1,
3956                                                  scalar_to_vec, stmt_info, 0,
3957                                                  vect_epilogue);
3958             }
3959           else
3960             {
3961               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3962                                                  stmt_info, 0, vect_epilogue);
3963               epilogue_cost += record_stmt_cost (cost_vec, 1,
3964                                                  vec_to_scalar, stmt_info, 0,
3965                                                  vect_epilogue);
3966             }
3967         }
3968       else if (reduction_type == COND_REDUCTION)
3969         {
3970           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3971           /* Extraction of scalar elements.  */
3972           epilogue_cost += record_stmt_cost (cost_vec,
3973                                              2 * estimated_nunits,
3974                                              vec_to_scalar, stmt_info, 0,
3975                                              vect_epilogue);
3976           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3977           epilogue_cost += record_stmt_cost (cost_vec,
3978                                              2 * estimated_nunits - 3,
3979                                              scalar_stmt, stmt_info, 0,
3980                                              vect_epilogue);
3981         }
3982       else if (reduction_type == EXTRACT_LAST_REDUCTION
3983                || reduction_type == FOLD_LEFT_REDUCTION)
3984         /* No extra instructions need in the epilogue.  */
3985         ;
3986       else
3987         {
3988           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3989           tree bitsize =
3990             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3991           int element_bitsize = tree_to_uhwi (bitsize);
3992           int nelements = vec_size_in_bits / element_bitsize;
3993
3994           if (code == COND_EXPR)
3995             code = MAX_EXPR;
3996
3997           optab = optab_for_tree_code (code, vectype, optab_default);
3998
3999           /* We have a whole vector shift available.  */
4000           if (optab != unknown_optab
4001               && VECTOR_MODE_P (mode)
4002               && optab_handler (optab, mode) != CODE_FOR_nothing
4003               && have_whole_vector_shift (mode))
4004             {
4005               /* Final reduction via vector shifts and the reduction operator.
4006                  Also requires scalar extract.  */
4007               epilogue_cost += record_stmt_cost (cost_vec,
4008                                                  exact_log2 (nelements) * 2,
4009                                                  vector_stmt, stmt_info, 0,
4010                                                  vect_epilogue);
4011               epilogue_cost += record_stmt_cost (cost_vec, 1,
4012                                                  vec_to_scalar, stmt_info, 0,
4013                                                  vect_epilogue);
4014             }
4015           else
4016             /* Use extracts and reduction op for final reduction.  For N
4017                elements, we have N extracts and N-1 reduction ops.  */
4018             epilogue_cost += record_stmt_cost (cost_vec,
4019                                                nelements + nelements - 1,
4020                                                vector_stmt, stmt_info, 0,
4021                                                vect_epilogue);
4022         }
4023     }
4024
4025   if (dump_enabled_p ())
4026     dump_printf (MSG_NOTE,
4027                  "vect_model_reduction_cost: inside_cost = %d, "
4028                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4029                  prologue_cost, epilogue_cost);
4030 }
4031
4032
4033 /* Function vect_model_induction_cost.
4034
4035    Models cost for induction operations.  */
4036
4037 static void
4038 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4039                            stmt_vector_for_cost *cost_vec)
4040 {
4041   unsigned inside_cost, prologue_cost;
4042
4043   if (PURE_SLP_STMT (stmt_info))
4044     return;
4045
4046   /* loop cost for vec_loop.  */
4047   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4048                                   stmt_info, 0, vect_body);
4049
4050   /* prologue cost for vec_init and vec_step.  */
4051   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4052                                     stmt_info, 0, vect_prologue);
4053
4054   if (dump_enabled_p ())
4055     dump_printf_loc (MSG_NOTE, vect_location,
4056                      "vect_model_induction_cost: inside_cost = %d, "
4057                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4058 }
4059
4060
4061
4062 /* Function get_initial_def_for_reduction
4063
4064    Input:
4065    STMT - a stmt that performs a reduction operation in the loop.
4066    INIT_VAL - the initial value of the reduction variable
4067
4068    Output:
4069    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4070         of the reduction (used for adjusting the epilog - see below).
4071    Return a vector variable, initialized according to the operation that STMT
4072         performs. This vector will be used as the initial value of the
4073         vector of partial results.
4074
4075    Option1 (adjust in epilog): Initialize the vector as follows:
4076      add/bit or/xor:    [0,0,...,0,0]
4077      mult/bit and:      [1,1,...,1,1]
4078      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4079    and when necessary (e.g. add/mult case) let the caller know
4080    that it needs to adjust the result by init_val.
4081
4082    Option2: Initialize the vector as follows:
4083      add/bit or/xor:    [init_val,0,0,...,0]
4084      mult/bit and:      [init_val,1,1,...,1]
4085      min/max/cond_expr: [init_val,init_val,...,init_val]
4086    and no adjustments are needed.
4087
4088    For example, for the following code:
4089
4090    s = init_val;
4091    for (i=0;i<n;i++)
4092      s = s + a[i];
4093
4094    STMT is 's = s + a[i]', and the reduction variable is 's'.
4095    For a vector of 4 units, we want to return either [0,0,0,init_val],
4096    or [0,0,0,0] and let the caller know that it needs to adjust
4097    the result at the end by 'init_val'.
4098
4099    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4100    initialization vector is simpler (same element in all entries), if
4101    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4102
4103    A cost model should help decide between these two schemes.  */
4104
4105 tree
4106 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4107                                tree *adjustment_def)
4108 {
4109   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4110   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4111   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4112   tree scalar_type = TREE_TYPE (init_val);
4113   tree vectype = get_vectype_for_scalar_type (scalar_type);
4114   enum tree_code code = gimple_assign_rhs_code (stmt);
4115   tree def_for_init;
4116   tree init_def;
4117   bool nested_in_vect_loop = false;
4118   REAL_VALUE_TYPE real_init_val = dconst0;
4119   int int_init_val = 0;
4120   gimple *def_stmt = NULL;
4121   gimple_seq stmts = NULL;
4122
4123   gcc_assert (vectype);
4124
4125   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4126               || SCALAR_FLOAT_TYPE_P (scalar_type));
4127
4128   if (nested_in_vect_loop_p (loop, stmt))
4129     nested_in_vect_loop = true;
4130   else
4131     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4132
4133   /* In case of double reduction we only create a vector variable to be put
4134      in the reduction phi node.  The actual statement creation is done in
4135      vect_create_epilog_for_reduction.  */
4136   if (adjustment_def && nested_in_vect_loop
4137       && TREE_CODE (init_val) == SSA_NAME
4138       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4139       && gimple_code (def_stmt) == GIMPLE_PHI
4140       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4141       && vinfo_for_stmt (def_stmt)
4142       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4143           == vect_double_reduction_def)
4144     {
4145       *adjustment_def = NULL;
4146       return vect_create_destination_var (init_val, vectype);
4147     }
4148
4149   vect_reduction_type reduction_type
4150     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4151
4152   /* In case of a nested reduction do not use an adjustment def as
4153      that case is not supported by the epilogue generation correctly
4154      if ncopies is not one.  */
4155   if (adjustment_def && nested_in_vect_loop)
4156     {
4157       *adjustment_def = NULL;
4158       return vect_get_vec_def_for_operand (init_val, stmt);
4159     }
4160
4161   switch (code)
4162     {
4163     case WIDEN_SUM_EXPR:
4164     case DOT_PROD_EXPR:
4165     case SAD_EXPR:
4166     case PLUS_EXPR:
4167     case MINUS_EXPR:
4168     case BIT_IOR_EXPR:
4169     case BIT_XOR_EXPR:
4170     case MULT_EXPR:
4171     case BIT_AND_EXPR:
4172       {
4173         /* ADJUSTMENT_DEF is NULL when called from
4174            vect_create_epilog_for_reduction to vectorize double reduction.  */
4175         if (adjustment_def)
4176           *adjustment_def = init_val;
4177
4178         if (code == MULT_EXPR)
4179           {
4180             real_init_val = dconst1;
4181             int_init_val = 1;
4182           }
4183
4184         if (code == BIT_AND_EXPR)
4185           int_init_val = -1;
4186
4187         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4188           def_for_init = build_real (scalar_type, real_init_val);
4189         else
4190           def_for_init = build_int_cst (scalar_type, int_init_val);
4191
4192         if (adjustment_def)
4193           /* Option1: the first element is '0' or '1' as well.  */
4194           init_def = gimple_build_vector_from_val (&stmts, vectype,
4195                                                    def_for_init);
4196         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4197           {
4198             /* Option2 (variable length): the first element is INIT_VAL.  */
4199             init_def = gimple_build_vector_from_val (&stmts, vectype,
4200                                                      def_for_init);
4201             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4202                                      vectype, init_def, init_val);
4203           }
4204         else
4205           {
4206             /* Option2: the first element is INIT_VAL.  */
4207             tree_vector_builder elts (vectype, 1, 2);
4208             elts.quick_push (init_val);
4209             elts.quick_push (def_for_init);
4210             init_def = gimple_build_vector (&stmts, &elts);
4211           }
4212       }
4213       break;
4214
4215     case MIN_EXPR:
4216     case MAX_EXPR:
4217     case COND_EXPR:
4218       {
4219         if (adjustment_def)
4220           {
4221             *adjustment_def = NULL_TREE;
4222             if (reduction_type != COND_REDUCTION
4223                 && reduction_type != EXTRACT_LAST_REDUCTION)
4224               {
4225                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4226                 break;
4227               }
4228           }
4229         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4230         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4231       }
4232       break;
4233
4234     default:
4235       gcc_unreachable ();
4236     }
4237
4238   if (stmts)
4239     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4240   return init_def;
4241 }
4242
4243 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4244    NUMBER_OF_VECTORS is the number of vector defs to create.
4245    If NEUTRAL_OP is nonnull, introducing extra elements of that
4246    value will not change the result.  */
4247
4248 static void
4249 get_initial_defs_for_reduction (slp_tree slp_node,
4250                                 vec<tree> *vec_oprnds,
4251                                 unsigned int number_of_vectors,
4252                                 bool reduc_chain, tree neutral_op)
4253 {
4254   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4255   gimple *stmt = stmts[0];
4256   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4257   unsigned HOST_WIDE_INT nunits;
4258   unsigned j, number_of_places_left_in_vector;
4259   tree vector_type;
4260   tree vop;
4261   int group_size = stmts.length ();
4262   unsigned int vec_num, i;
4263   unsigned number_of_copies = 1;
4264   vec<tree> voprnds;
4265   voprnds.create (number_of_vectors);
4266   struct loop *loop;
4267   auto_vec<tree, 16> permute_results;
4268
4269   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4270
4271   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4272
4273   loop = (gimple_bb (stmt))->loop_father;
4274   gcc_assert (loop);
4275   edge pe = loop_preheader_edge (loop);
4276
4277   gcc_assert (!reduc_chain || neutral_op);
4278
4279   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4280      created vectors. It is greater than 1 if unrolling is performed.
4281
4282      For example, we have two scalar operands, s1 and s2 (e.g., group of
4283      strided accesses of size two), while NUNITS is four (i.e., four scalars
4284      of this type can be packed in a vector).  The output vector will contain
4285      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4286      will be 2).
4287
4288      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4289      vectors containing the operands.
4290
4291      For example, NUNITS is four as before, and the group size is 8
4292      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4293      {s5, s6, s7, s8}.  */
4294
4295   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4296     nunits = group_size;
4297
4298   number_of_copies = nunits * number_of_vectors / group_size;
4299
4300   number_of_places_left_in_vector = nunits;
4301   bool constant_p = true;
4302   tree_vector_builder elts (vector_type, nunits, 1);
4303   elts.quick_grow (nunits);
4304   for (j = 0; j < number_of_copies; j++)
4305     {
4306       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4307         {
4308           tree op;
4309           /* Get the def before the loop.  In reduction chain we have only
4310              one initial value.  */
4311           if ((j != (number_of_copies - 1)
4312                || (reduc_chain && i != 0))
4313               && neutral_op)
4314             op = neutral_op;
4315           else
4316             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4317
4318           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4319           number_of_places_left_in_vector--;
4320           elts[number_of_places_left_in_vector] = op;
4321           if (!CONSTANT_CLASS_P (op))
4322             constant_p = false;
4323
4324           if (number_of_places_left_in_vector == 0)
4325             {
4326               gimple_seq ctor_seq = NULL;
4327               tree init;
4328               if (constant_p && !neutral_op
4329                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4330                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4331                 /* Build the vector directly from ELTS.  */
4332                 init = gimple_build_vector (&ctor_seq, &elts);
4333               else if (neutral_op)
4334                 {
4335                   /* Build a vector of the neutral value and shift the
4336                      other elements into place.  */
4337                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4338                                                        neutral_op);
4339                   int k = nunits;
4340                   while (k > 0 && elts[k - 1] == neutral_op)
4341                     k -= 1;
4342                   while (k > 0)
4343                     {
4344                       k -= 1;
4345                       init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4346                                            vector_type, init, elts[k]);
4347                     }
4348                 }
4349               else
4350                 {
4351                   /* First time round, duplicate ELTS to fill the
4352                      required number of vectors, then cherry pick the
4353                      appropriate result for each iteration.  */
4354                   if (vec_oprnds->is_empty ())
4355                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4356                                               number_of_vectors,
4357                                               permute_results);
4358                   init = permute_results[number_of_vectors - j - 1];
4359                 }
4360               if (ctor_seq != NULL)
4361                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4362               voprnds.quick_push (init);
4363
4364               number_of_places_left_in_vector = nunits;
4365               elts.new_vector (vector_type, nunits, 1);
4366               elts.quick_grow (nunits);
4367               constant_p = true;
4368             }
4369         }
4370     }
4371
4372   /* Since the vectors are created in the reverse order, we should invert
4373      them.  */
4374   vec_num = voprnds.length ();
4375   for (j = vec_num; j != 0; j--)
4376     {
4377       vop = voprnds[j - 1];
4378       vec_oprnds->quick_push (vop);
4379     }
4380
4381   voprnds.release ();
4382
4383   /* In case that VF is greater than the unrolling factor needed for the SLP
4384      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4385      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4386      to replicate the vectors.  */
4387   tree neutral_vec = NULL;
4388   while (number_of_vectors > vec_oprnds->length ())
4389     {
4390       if (neutral_op)
4391         {
4392           if (!neutral_vec)
4393             {
4394               gimple_seq ctor_seq = NULL;
4395               neutral_vec = gimple_build_vector_from_val
4396                 (&ctor_seq, vector_type, neutral_op);
4397               if (ctor_seq != NULL)
4398                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4399             }
4400           vec_oprnds->quick_push (neutral_vec);
4401         }
4402       else
4403         {
4404           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4405             vec_oprnds->quick_push (vop);
4406         }
4407     }
4408 }
4409
4410
4411 /* Function vect_create_epilog_for_reduction
4412
4413    Create code at the loop-epilog to finalize the result of a reduction
4414    computation.
4415
4416    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4417      reduction statements.
4418    STMT is the scalar reduction stmt that is being vectorized.
4419    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4420      number of elements that we can fit in a vectype (nunits).  In this case
4421      we have to generate more than one vector stmt - i.e - we need to "unroll"
4422      the vector stmt by a factor VF/nunits.  For more details see documentation
4423      in vectorizable_operation.
4424    REDUC_FN is the internal function for the epilog reduction.
4425    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4426      computation.
4427    REDUC_INDEX is the index of the operand in the right hand side of the
4428      statement that is defined by REDUCTION_PHI.
4429    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4430    SLP_NODE is an SLP node containing a group of reduction statements. The
4431      first one in this group is STMT.
4432    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4433      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4434      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4435      any value of the IV in the loop.
4436    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4437    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4438      null if this is not an SLP reduction
4439
4440    This function:
4441    1. Creates the reduction def-use cycles: sets the arguments for
4442       REDUCTION_PHIS:
4443       The loop-entry argument is the vectorized initial-value of the reduction.
4444       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4445       sums.
4446    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4447       by calling the function specified by REDUC_FN if available, or by
4448       other means (whole-vector shifts or a scalar loop).
4449       The function also creates a new phi node at the loop exit to preserve
4450       loop-closed form, as illustrated below.
4451
4452      The flow at the entry to this function:
4453
4454         loop:
4455           vec_def = phi <null, null>            # REDUCTION_PHI
4456           VECT_DEF = vector_stmt                # vectorized form of STMT
4457           s_loop = scalar_stmt                  # (scalar) STMT
4458         loop_exit:
4459           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4460           use <s_out0>
4461           use <s_out0>
4462
4463      The above is transformed by this function into:
4464
4465         loop:
4466           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4467           VECT_DEF = vector_stmt                # vectorized form of STMT
4468           s_loop = scalar_stmt                  # (scalar) STMT
4469         loop_exit:
4470           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4471           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4472           v_out2 = reduce <v_out1>
4473           s_out3 = extract_field <v_out2, 0>
4474           s_out4 = adjust_result <s_out3>
4475           use <s_out4>
4476           use <s_out4>
4477 */
4478
4479 static void
4480 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4481                                   gimple *reduc_def_stmt,
4482                                   int ncopies, internal_fn reduc_fn,
4483                                   vec<gimple *> reduction_phis,
4484                                   bool double_reduc,
4485                                   slp_tree slp_node,
4486                                   slp_instance slp_node_instance,
4487                                   tree induc_val, enum tree_code induc_code,
4488                                   tree neutral_op)
4489 {
4490   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4491   stmt_vec_info prev_phi_info;
4492   tree vectype;
4493   machine_mode mode;
4494   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4495   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4496   basic_block exit_bb;
4497   tree scalar_dest;
4498   tree scalar_type;
4499   gimple *new_phi = NULL, *phi;
4500   gimple_stmt_iterator exit_gsi;
4501   tree vec_dest;
4502   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4503   gimple *epilog_stmt = NULL;
4504   enum tree_code code = gimple_assign_rhs_code (stmt);
4505   gimple *exit_phi;
4506   tree bitsize;
4507   tree adjustment_def = NULL;
4508   tree vec_initial_def = NULL;
4509   tree expr, def, initial_def = NULL;
4510   tree orig_name, scalar_result;
4511   imm_use_iterator imm_iter, phi_imm_iter;
4512   use_operand_p use_p, phi_use_p;
4513   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4514   bool nested_in_vect_loop = false;
4515   auto_vec<gimple *> new_phis;
4516   auto_vec<gimple *> inner_phis;
4517   enum vect_def_type dt = vect_unknown_def_type;
4518   int j, i;
4519   auto_vec<tree> scalar_results;
4520   unsigned int group_size = 1, k, ratio;
4521   auto_vec<tree> vec_initial_defs;
4522   auto_vec<gimple *> phis;
4523   bool slp_reduc = false;
4524   bool direct_slp_reduc;
4525   tree new_phi_result;
4526   gimple *inner_phi = NULL;
4527   tree induction_index = NULL_TREE;
4528
4529   if (slp_node)
4530     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4531
4532   if (nested_in_vect_loop_p (loop, stmt))
4533     {
4534       outer_loop = loop;
4535       loop = loop->inner;
4536       nested_in_vect_loop = true;
4537       gcc_assert (!slp_node);
4538     }
4539
4540   vectype = STMT_VINFO_VECTYPE (stmt_info);
4541   gcc_assert (vectype);
4542   mode = TYPE_MODE (vectype);
4543
4544   /* 1. Create the reduction def-use cycle:
4545      Set the arguments of REDUCTION_PHIS, i.e., transform
4546
4547         loop:
4548           vec_def = phi <null, null>            # REDUCTION_PHI
4549           VECT_DEF = vector_stmt                # vectorized form of STMT
4550           ...
4551
4552      into:
4553
4554         loop:
4555           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4556           VECT_DEF = vector_stmt                # vectorized form of STMT
4557           ...
4558
4559      (in case of SLP, do it for all the phis). */
4560
4561   /* Get the loop-entry arguments.  */
4562   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4563   if (slp_node)
4564     {
4565       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4566       vec_initial_defs.reserve (vec_num);
4567       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4568                                       &vec_initial_defs, vec_num,
4569                                       REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4570                                       neutral_op);
4571     }
4572   else
4573     {
4574       /* Get at the scalar def before the loop, that defines the initial value
4575          of the reduction variable.  */
4576       gimple *def_stmt;
4577       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4578                                            loop_preheader_edge (loop));
4579       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4580          and we can't use zero for induc_val, use initial_def.  Similarly
4581          for REDUC_MIN and initial_def larger than the base.  */
4582       if (TREE_CODE (initial_def) == INTEGER_CST
4583           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4584               == INTEGER_INDUC_COND_REDUCTION)
4585           && !integer_zerop (induc_val)
4586           && ((induc_code == MAX_EXPR
4587                && tree_int_cst_lt (initial_def, induc_val))
4588               || (induc_code == MIN_EXPR
4589                   && tree_int_cst_lt (induc_val, initial_def))))
4590         induc_val = initial_def;
4591       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4592       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4593                                                        &adjustment_def);
4594       vec_initial_defs.create (1);
4595       vec_initial_defs.quick_push (vec_initial_def);
4596     }
4597
4598   /* Set phi nodes arguments.  */
4599   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4600     {
4601       tree vec_init_def = vec_initial_defs[i];
4602       tree def = vect_defs[i];
4603       for (j = 0; j < ncopies; j++)
4604         {
4605           if (j != 0)
4606             {
4607               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4608               if (nested_in_vect_loop)
4609                 vec_init_def
4610                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4611                                                     vec_init_def);
4612             }
4613
4614           /* Set the loop-entry arg of the reduction-phi.  */
4615
4616           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4617               == INTEGER_INDUC_COND_REDUCTION)
4618             {
4619               /* Initialise the reduction phi to zero.  This prevents initial
4620                  values of non-zero interferring with the reduction op.  */
4621               gcc_assert (ncopies == 1);
4622               gcc_assert (i == 0);
4623
4624               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4625               tree induc_val_vec
4626                 = build_vector_from_val (vec_init_def_type, induc_val);
4627
4628               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4629                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4630             }
4631           else
4632             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4633                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4634
4635           /* Set the loop-latch arg for the reduction-phi.  */
4636           if (j > 0)
4637             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4638
4639           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4640                        UNKNOWN_LOCATION);
4641
4642           if (dump_enabled_p ())
4643             {
4644               dump_printf_loc (MSG_NOTE, vect_location,
4645                                "transform reduction: created def-use cycle: ");
4646               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4647               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4648             }
4649         }
4650     }
4651
4652   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4653      which is updated with the current index of the loop for every match of
4654      the original loop's cond_expr (VEC_STMT).  This results in a vector
4655      containing the last time the condition passed for that vector lane.
4656      The first match will be a 1 to allow 0 to be used for non-matching
4657      indexes.  If there are no matches at all then the vector will be all
4658      zeroes.  */
4659   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4660     {
4661       tree indx_before_incr, indx_after_incr;
4662       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4663
4664       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4665       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4666
4667       int scalar_precision
4668         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4669       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4670       tree cr_index_vector_type = build_vector_type
4671         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4672
4673       /* First we create a simple vector induction variable which starts
4674          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4675          vector size (STEP).  */
4676
4677       /* Create a {1,2,3,...} vector.  */
4678       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4679
4680       /* Create a vector of the step value.  */
4681       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4682       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4683
4684       /* Create an induction variable.  */
4685       gimple_stmt_iterator incr_gsi;
4686       bool insert_after;
4687       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4688       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4689                  insert_after, &indx_before_incr, &indx_after_incr);
4690
4691       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4692          filled with zeros (VEC_ZERO).  */
4693
4694       /* Create a vector of 0s.  */
4695       tree zero = build_zero_cst (cr_index_scalar_type);
4696       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4697
4698       /* Create a vector phi node.  */
4699       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4700       new_phi = create_phi_node (new_phi_tree, loop->header);
4701       set_vinfo_for_stmt (new_phi,
4702                           new_stmt_vec_info (new_phi, loop_vinfo));
4703       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4704                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4705
4706       /* Now take the condition from the loops original cond_expr
4707          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4708          every match uses values from the induction variable
4709          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4710          (NEW_PHI_TREE).
4711          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4712          the new cond_expr (INDEX_COND_EXPR).  */
4713
4714       /* Duplicate the condition from vec_stmt.  */
4715       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4716
4717       /* Create a conditional, where the condition is taken from vec_stmt
4718          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4719          else is the phi (NEW_PHI_TREE).  */
4720       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4721                                      ccompare, indx_before_incr,
4722                                      new_phi_tree);
4723       induction_index = make_ssa_name (cr_index_vector_type);
4724       gimple *index_condition = gimple_build_assign (induction_index,
4725                                                      index_cond_expr);
4726       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4727       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4728                                                         loop_vinfo);
4729       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4730       set_vinfo_for_stmt (index_condition, index_vec_info);
4731
4732       /* Update the phi with the vec cond.  */
4733       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4734                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4735     }
4736
4737   /* 2. Create epilog code.
4738         The reduction epilog code operates across the elements of the vector
4739         of partial results computed by the vectorized loop.
4740         The reduction epilog code consists of:
4741
4742         step 1: compute the scalar result in a vector (v_out2)
4743         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4744         step 3: adjust the scalar result (s_out3) if needed.
4745
4746         Step 1 can be accomplished using one the following three schemes:
4747           (scheme 1) using reduc_fn, if available.
4748           (scheme 2) using whole-vector shifts, if available.
4749           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4750                      combined.
4751
4752           The overall epilog code looks like this:
4753
4754           s_out0 = phi <s_loop>         # original EXIT_PHI
4755           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4756           v_out2 = reduce <v_out1>              # step 1
4757           s_out3 = extract_field <v_out2, 0>    # step 2
4758           s_out4 = adjust_result <s_out3>       # step 3
4759
4760           (step 3 is optional, and steps 1 and 2 may be combined).
4761           Lastly, the uses of s_out0 are replaced by s_out4.  */
4762
4763
4764   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4765          v_out1 = phi <VECT_DEF>
4766          Store them in NEW_PHIS.  */
4767
4768   exit_bb = single_exit (loop)->dest;
4769   prev_phi_info = NULL;
4770   new_phis.create (vect_defs.length ());
4771   FOR_EACH_VEC_ELT (vect_defs, i, def)
4772     {
4773       for (j = 0; j < ncopies; j++)
4774         {
4775           tree new_def = copy_ssa_name (def);
4776           phi = create_phi_node (new_def, exit_bb);
4777           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4778           if (j == 0)
4779             new_phis.quick_push (phi);
4780           else
4781             {
4782               def = vect_get_vec_def_for_stmt_copy (dt, def);
4783               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4784             }
4785
4786           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4787           prev_phi_info = vinfo_for_stmt (phi);
4788         }
4789     }
4790
4791   /* The epilogue is created for the outer-loop, i.e., for the loop being
4792      vectorized.  Create exit phis for the outer loop.  */
4793   if (double_reduc)
4794     {
4795       loop = outer_loop;
4796       exit_bb = single_exit (loop)->dest;
4797       inner_phis.create (vect_defs.length ());
4798       FOR_EACH_VEC_ELT (new_phis, i, phi)
4799         {
4800           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4801           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4802           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4803                            PHI_RESULT (phi));
4804           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4805                                                             loop_vinfo));
4806           inner_phis.quick_push (phi);
4807           new_phis[i] = outer_phi;
4808           prev_phi_info = vinfo_for_stmt (outer_phi);
4809           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4810             {
4811               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4812               new_result = copy_ssa_name (PHI_RESULT (phi));
4813               outer_phi = create_phi_node (new_result, exit_bb);
4814               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4815                                PHI_RESULT (phi));
4816               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4817                                                                 loop_vinfo));
4818               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4819               prev_phi_info = vinfo_for_stmt (outer_phi);
4820             }
4821         }
4822     }
4823
4824   exit_gsi = gsi_after_labels (exit_bb);
4825
4826   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4827          (i.e. when reduc_fn is not available) and in the final adjustment
4828          code (if needed).  Also get the original scalar reduction variable as
4829          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4830          represents a reduction pattern), the tree-code and scalar-def are
4831          taken from the original stmt that the pattern-stmt (STMT) replaces.
4832          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4833          are taken from STMT.  */
4834
4835   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4836   if (!orig_stmt)
4837     {
4838       /* Regular reduction  */
4839       orig_stmt = stmt;
4840     }
4841   else
4842     {
4843       /* Reduction pattern  */
4844       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4845       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4846       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4847     }
4848
4849   code = gimple_assign_rhs_code (orig_stmt);
4850   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4851      partial results are added and not subtracted.  */
4852   if (code == MINUS_EXPR)
4853     code = PLUS_EXPR;
4854
4855   scalar_dest = gimple_assign_lhs (orig_stmt);
4856   scalar_type = TREE_TYPE (scalar_dest);
4857   scalar_results.create (group_size);
4858   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4859   bitsize = TYPE_SIZE (scalar_type);
4860
4861   /* In case this is a reduction in an inner-loop while vectorizing an outer
4862      loop - we don't need to extract a single scalar result at the end of the
4863      inner-loop (unless it is double reduction, i.e., the use of reduction is
4864      outside the outer-loop).  The final vector of partial results will be used
4865      in the vectorized outer-loop, or reduced to a scalar result at the end of
4866      the outer-loop.  */
4867   if (nested_in_vect_loop && !double_reduc)
4868     goto vect_finalize_reduction;
4869
4870   /* SLP reduction without reduction chain, e.g.,
4871      # a1 = phi <a2, a0>
4872      # b1 = phi <b2, b0>
4873      a2 = operation (a1)
4874      b2 = operation (b1)  */
4875   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4876
4877   /* True if we should implement SLP_REDUC using native reduction operations
4878      instead of scalar operations.  */
4879   direct_slp_reduc = (reduc_fn != IFN_LAST
4880                       && slp_reduc
4881                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4882
4883   /* In case of reduction chain, e.g.,
4884      # a1 = phi <a3, a0>
4885      a2 = operation (a1)
4886      a3 = operation (a2),
4887
4888      we may end up with more than one vector result.  Here we reduce them to
4889      one vector.  */
4890   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4891     {
4892       tree first_vect = PHI_RESULT (new_phis[0]);
4893       gassign *new_vec_stmt = NULL;
4894       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4895       for (k = 1; k < new_phis.length (); k++)
4896         {
4897           gimple *next_phi = new_phis[k];
4898           tree second_vect = PHI_RESULT (next_phi);
4899           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4900           new_vec_stmt = gimple_build_assign (tem, code,
4901                                               first_vect, second_vect);
4902           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4903           first_vect = tem;
4904         }
4905
4906       new_phi_result = first_vect;
4907       if (new_vec_stmt)
4908         {
4909           new_phis.truncate (0);
4910           new_phis.safe_push (new_vec_stmt);
4911         }
4912     }
4913   /* Likewise if we couldn't use a single defuse cycle.  */
4914   else if (ncopies > 1)
4915     {
4916       gcc_assert (new_phis.length () == 1);
4917       tree first_vect = PHI_RESULT (new_phis[0]);
4918       gassign *new_vec_stmt = NULL;
4919       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4920       gimple *next_phi = new_phis[0];
4921       for (int k = 1; k < ncopies; ++k)
4922         {
4923           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4924           tree second_vect = PHI_RESULT (next_phi);
4925           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4926           new_vec_stmt = gimple_build_assign (tem, code,
4927                                               first_vect, second_vect);
4928           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4929           first_vect = tem;
4930         }
4931       new_phi_result = first_vect;
4932       new_phis.truncate (0);
4933       new_phis.safe_push (new_vec_stmt);
4934     }
4935   else
4936     new_phi_result = PHI_RESULT (new_phis[0]);
4937
4938   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4939       && reduc_fn != IFN_LAST)
4940     {
4941       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4942          various data values where the condition matched and another vector
4943          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4944          need to extract the last matching index (which will be the index with
4945          highest value) and use this to index into the data vector.
4946          For the case where there were no matches, the data vector will contain
4947          all default values and the index vector will be all zeros.  */
4948
4949       /* Get various versions of the type of the vector of indexes.  */
4950       tree index_vec_type = TREE_TYPE (induction_index);
4951       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4952       tree index_scalar_type = TREE_TYPE (index_vec_type);
4953       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4954         (index_vec_type);
4955
4956       /* Get an unsigned integer version of the type of the data vector.  */
4957       int scalar_precision
4958         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4959       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4960       tree vectype_unsigned = build_vector_type
4961         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4962
4963       /* First we need to create a vector (ZERO_VEC) of zeros and another
4964          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4965          can create using a MAX reduction and then expanding.
4966          In the case where the loop never made any matches, the max index will
4967          be zero.  */
4968
4969       /* Vector of {0, 0, 0,...}.  */
4970       tree zero_vec = make_ssa_name (vectype);
4971       tree zero_vec_rhs = build_zero_cst (vectype);
4972       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4973       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4974
4975       /* Find maximum value from the vector of found indexes.  */
4976       tree max_index = make_ssa_name (index_scalar_type);
4977       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4978                                                           1, induction_index);
4979       gimple_call_set_lhs (max_index_stmt, max_index);
4980       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4981
4982       /* Vector of {max_index, max_index, max_index,...}.  */
4983       tree max_index_vec = make_ssa_name (index_vec_type);
4984       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4985                                                       max_index);
4986       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4987                                                         max_index_vec_rhs);
4988       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4989
4990       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4991          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4992          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4993          otherwise.  Only one value should match, resulting in a vector
4994          (VEC_COND) with one data value and the rest zeros.
4995          In the case where the loop never made any matches, every index will
4996          match, resulting in a vector with all data values (which will all be
4997          the default value).  */
4998
4999       /* Compare the max index vector to the vector of found indexes to find
5000          the position of the max value.  */
5001       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5002       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5003                                                       induction_index,
5004                                                       max_index_vec);
5005       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5006
5007       /* Use the compare to choose either values from the data vector or
5008          zero.  */
5009       tree vec_cond = make_ssa_name (vectype);
5010       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5011                                                    vec_compare, new_phi_result,
5012                                                    zero_vec);
5013       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5014
5015       /* Finally we need to extract the data value from the vector (VEC_COND)
5016          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5017          reduction, but because this doesn't exist, we can use a MAX reduction
5018          instead.  The data value might be signed or a float so we need to cast
5019          it first.
5020          In the case where the loop never made any matches, the data values are
5021          all identical, and so will reduce down correctly.  */
5022
5023       /* Make the matched data values unsigned.  */
5024       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5025       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5026                                        vec_cond);
5027       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5028                                                         VIEW_CONVERT_EXPR,
5029                                                         vec_cond_cast_rhs);
5030       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5031
5032       /* Reduce down to a scalar value.  */
5033       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5034       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5035                                                            1, vec_cond_cast);
5036       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5037       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5038
5039       /* Convert the reduced value back to the result type and set as the
5040          result.  */
5041       gimple_seq stmts = NULL;
5042       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5043                                data_reduc);
5044       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5045       scalar_results.safe_push (new_temp);
5046     }
5047   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5048            && reduc_fn == IFN_LAST)
5049     {
5050       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5051          idx = 0;
5052          idx_val = induction_index[0];
5053          val = data_reduc[0];
5054          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5055            if (induction_index[i] > idx_val)
5056              val = data_reduc[i], idx_val = induction_index[i];
5057          return val;  */
5058
5059       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5060       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5061       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5062       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5063       /* Enforced by vectorizable_reduction, which ensures we have target
5064          support before allowing a conditional reduction on variable-length
5065          vectors.  */
5066       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5067       tree idx_val = NULL_TREE, val = NULL_TREE;
5068       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5069         {
5070           tree old_idx_val = idx_val;
5071           tree old_val = val;
5072           idx_val = make_ssa_name (idx_eltype);
5073           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5074                                              build3 (BIT_FIELD_REF, idx_eltype,
5075                                                      induction_index,
5076                                                      bitsize_int (el_size),
5077                                                      bitsize_int (off)));
5078           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5079           val = make_ssa_name (data_eltype);
5080           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5081                                              build3 (BIT_FIELD_REF,
5082                                                      data_eltype,
5083                                                      new_phi_result,
5084                                                      bitsize_int (el_size),
5085                                                      bitsize_int (off)));
5086           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5087           if (off != 0)
5088             {
5089               tree new_idx_val = idx_val;
5090               tree new_val = val;
5091               if (off != v_size - el_size)
5092                 {
5093                   new_idx_val = make_ssa_name (idx_eltype);
5094                   epilog_stmt = gimple_build_assign (new_idx_val,
5095                                                      MAX_EXPR, idx_val,
5096                                                      old_idx_val);
5097                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5098                 }
5099               new_val = make_ssa_name (data_eltype);
5100               epilog_stmt = gimple_build_assign (new_val,
5101                                                  COND_EXPR,
5102                                                  build2 (GT_EXPR,
5103                                                          boolean_type_node,
5104                                                          idx_val,
5105                                                          old_idx_val),
5106                                                  val, old_val);
5107               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5108               idx_val = new_idx_val;
5109               val = new_val;
5110             }
5111         }
5112       /* Convert the reduced value back to the result type and set as the
5113          result.  */
5114       gimple_seq stmts = NULL;
5115       val = gimple_convert (&stmts, scalar_type, val);
5116       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5117       scalar_results.safe_push (val);
5118     }
5119
5120   /* 2.3 Create the reduction code, using one of the three schemes described
5121          above. In SLP we simply need to extract all the elements from the
5122          vector (without reducing them), so we use scalar shifts.  */
5123   else if (reduc_fn != IFN_LAST && !slp_reduc)
5124     {
5125       tree tmp;
5126       tree vec_elem_type;
5127
5128       /* Case 1:  Create:
5129          v_out2 = reduc_expr <v_out1>  */
5130
5131       if (dump_enabled_p ())
5132         dump_printf_loc (MSG_NOTE, vect_location,
5133                          "Reduce using direct vector reduction.\n");
5134
5135       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5136       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5137         {
5138           tree tmp_dest
5139             = vect_create_destination_var (scalar_dest, vec_elem_type);
5140           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5141                                                     new_phi_result);
5142           gimple_set_lhs (epilog_stmt, tmp_dest);
5143           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5144           gimple_set_lhs (epilog_stmt, new_temp);
5145           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5146
5147           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5148                                              new_temp);
5149         }
5150       else
5151         {
5152           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5153                                                     new_phi_result);
5154           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5155         }
5156
5157       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5158       gimple_set_lhs (epilog_stmt, new_temp);
5159       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5160
5161       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5162            == INTEGER_INDUC_COND_REDUCTION)
5163           && !operand_equal_p (initial_def, induc_val, 0))
5164         {
5165           /* Earlier we set the initial value to be a vector if induc_val
5166              values.  Check the result and if it is induc_val then replace
5167              with the original initial value, unless induc_val is
5168              the same as initial_def already.  */
5169           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5170                                   induc_val);
5171
5172           tmp = make_ssa_name (new_scalar_dest);
5173           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5174                                              initial_def, new_temp);
5175           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5176           new_temp = tmp;
5177         }
5178
5179       scalar_results.safe_push (new_temp);
5180     }
5181   else if (direct_slp_reduc)
5182     {
5183       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5184          with the elements for other SLP statements replaced with the
5185          neutral value.  We can then do a normal reduction on each vector.  */
5186
5187       /* Enforced by vectorizable_reduction.  */
5188       gcc_assert (new_phis.length () == 1);
5189       gcc_assert (pow2p_hwi (group_size));
5190
5191       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5192       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5193       gimple_seq seq = NULL;
5194
5195       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5196          and the same element size as VECTYPE.  */
5197       tree index = build_index_vector (vectype, 0, 1);
5198       tree index_type = TREE_TYPE (index);
5199       tree index_elt_type = TREE_TYPE (index_type);
5200       tree mask_type = build_same_sized_truth_vector_type (index_type);
5201
5202       /* Create a vector that, for each element, identifies which of
5203          the REDUC_GROUP_SIZE results should use it.  */
5204       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5205       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5206                             build_vector_from_val (index_type, index_mask));
5207
5208       /* Get a neutral vector value.  This is simply a splat of the neutral
5209          scalar value if we have one, otherwise the initial scalar value
5210          is itself a neutral value.  */
5211       tree vector_identity = NULL_TREE;
5212       if (neutral_op)
5213         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5214                                                         neutral_op);
5215       for (unsigned int i = 0; i < group_size; ++i)
5216         {
5217           /* If there's no univeral neutral value, we can use the
5218              initial scalar value from the original PHI.  This is used
5219              for MIN and MAX reduction, for example.  */
5220           if (!neutral_op)
5221             {
5222               tree scalar_value
5223                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5224                                          loop_preheader_edge (loop));
5225               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5226                                                               scalar_value);
5227             }
5228
5229           /* Calculate the equivalent of:
5230
5231              sel[j] = (index[j] == i);
5232
5233              which selects the elements of NEW_PHI_RESULT that should
5234              be included in the result.  */
5235           tree compare_val = build_int_cst (index_elt_type, i);
5236           compare_val = build_vector_from_val (index_type, compare_val);
5237           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5238                                    index, compare_val);
5239
5240           /* Calculate the equivalent of:
5241
5242              vec = seq ? new_phi_result : vector_identity;
5243
5244              VEC is now suitable for a full vector reduction.  */
5245           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5246                                    sel, new_phi_result, vector_identity);
5247
5248           /* Do the reduction and convert it to the appropriate type.  */
5249           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5250                                       TREE_TYPE (vectype), vec);
5251           scalar = gimple_convert (&seq, scalar_type, scalar);
5252           scalar_results.safe_push (scalar);
5253         }
5254       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5255     }
5256   else
5257     {
5258       bool reduce_with_shift;
5259       tree vec_temp;
5260
5261       /* COND reductions all do the final reduction with MAX_EXPR
5262          or MIN_EXPR.  */
5263       if (code == COND_EXPR)
5264         {
5265           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5266               == INTEGER_INDUC_COND_REDUCTION)
5267             code = induc_code;
5268           else
5269             code = MAX_EXPR;
5270         }
5271
5272       /* See if the target wants to do the final (shift) reduction
5273          in a vector mode of smaller size and first reduce upper/lower
5274          halves against each other.  */
5275       enum machine_mode mode1 = mode;
5276       tree vectype1 = vectype;
5277       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5278       unsigned sz1 = sz;
5279       if (!slp_reduc
5280           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5281         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5282
5283       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5284       reduce_with_shift = have_whole_vector_shift (mode1);
5285       if (!VECTOR_MODE_P (mode1))
5286         reduce_with_shift = false;
5287       else
5288         {
5289           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5290           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5291             reduce_with_shift = false;
5292         }
5293
5294       /* First reduce the vector to the desired vector size we should
5295          do shift reduction on by combining upper and lower halves.  */
5296       new_temp = new_phi_result;
5297       while (sz > sz1)
5298         {
5299           gcc_assert (!slp_reduc);
5300           sz /= 2;
5301           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5302
5303           /* The target has to make sure we support lowpart/highpart
5304              extraction, either via direct vector extract or through
5305              an integer mode punning.  */
5306           tree dst1, dst2;
5307           if (convert_optab_handler (vec_extract_optab,
5308                                      TYPE_MODE (TREE_TYPE (new_temp)),
5309                                      TYPE_MODE (vectype1))
5310               != CODE_FOR_nothing)
5311             {
5312               /* Extract sub-vectors directly once vec_extract becomes
5313                  a conversion optab.  */
5314               dst1 = make_ssa_name (vectype1);
5315               epilog_stmt
5316                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5317                                          build3 (BIT_FIELD_REF, vectype1,
5318                                                  new_temp, TYPE_SIZE (vectype1),
5319                                                  bitsize_int (0)));
5320               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5321               dst2 =  make_ssa_name (vectype1);
5322               epilog_stmt
5323                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5324                                          build3 (BIT_FIELD_REF, vectype1,
5325                                                  new_temp, TYPE_SIZE (vectype1),
5326                                                  bitsize_int (sz * BITS_PER_UNIT)));
5327               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5328             }
5329           else
5330             {
5331               /* Extract via punning to appropriately sized integer mode
5332                  vector.  */
5333               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5334                                                             1);
5335               tree etype = build_vector_type (eltype, 2);
5336               gcc_assert (convert_optab_handler (vec_extract_optab,
5337                                                  TYPE_MODE (etype),
5338                                                  TYPE_MODE (eltype))
5339                           != CODE_FOR_nothing);
5340               tree tem = make_ssa_name (etype);
5341               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5342                                                  build1 (VIEW_CONVERT_EXPR,
5343                                                          etype, new_temp));
5344               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5345               new_temp = tem;
5346               tem = make_ssa_name (eltype);
5347               epilog_stmt
5348                   = gimple_build_assign (tem, BIT_FIELD_REF,
5349                                          build3 (BIT_FIELD_REF, eltype,
5350                                                  new_temp, TYPE_SIZE (eltype),
5351                                                  bitsize_int (0)));
5352               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5353               dst1 = make_ssa_name (vectype1);
5354               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5355                                                  build1 (VIEW_CONVERT_EXPR,
5356                                                          vectype1, tem));
5357               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5358               tem = make_ssa_name (eltype);
5359               epilog_stmt
5360                   = gimple_build_assign (tem, BIT_FIELD_REF,
5361                                          build3 (BIT_FIELD_REF, eltype,
5362                                                  new_temp, TYPE_SIZE (eltype),
5363                                                  bitsize_int (sz * BITS_PER_UNIT)));
5364               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5365               dst2 =  make_ssa_name (vectype1);
5366               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5367                                                  build1 (VIEW_CONVERT_EXPR,
5368                                                          vectype1, tem));
5369               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5370             }
5371
5372           new_temp = make_ssa_name (vectype1);
5373           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5374           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5375         }
5376
5377       if (reduce_with_shift && !slp_reduc)
5378         {
5379           int element_bitsize = tree_to_uhwi (bitsize);
5380           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5381              for variable-length vectors and also requires direct target support
5382              for loop reductions.  */
5383           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5384           int nelements = vec_size_in_bits / element_bitsize;
5385           vec_perm_builder sel;
5386           vec_perm_indices indices;
5387
5388           int elt_offset;
5389
5390           tree zero_vec = build_zero_cst (vectype1);
5391           /* Case 2: Create:
5392              for (offset = nelements/2; offset >= 1; offset/=2)
5393                 {
5394                   Create:  va' = vec_shift <va, offset>
5395                   Create:  va = vop <va, va'>
5396                 }  */
5397
5398           tree rhs;
5399
5400           if (dump_enabled_p ())
5401             dump_printf_loc (MSG_NOTE, vect_location,
5402                              "Reduce using vector shifts\n");
5403
5404           mode1 = TYPE_MODE (vectype1);
5405           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5406           for (elt_offset = nelements / 2;
5407                elt_offset >= 1;
5408                elt_offset /= 2)
5409             {
5410               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5411               indices.new_vector (sel, 2, nelements);
5412               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5413               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5414                                                  new_temp, zero_vec, mask);
5415               new_name = make_ssa_name (vec_dest, epilog_stmt);
5416               gimple_assign_set_lhs (epilog_stmt, new_name);
5417               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5418
5419               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5420                                                  new_temp);
5421               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5422               gimple_assign_set_lhs (epilog_stmt, new_temp);
5423               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5424             }
5425
5426           /* 2.4  Extract the final scalar result.  Create:
5427              s_out3 = extract_field <v_out2, bitpos>  */
5428
5429           if (dump_enabled_p ())
5430             dump_printf_loc (MSG_NOTE, vect_location,
5431                              "extract scalar result\n");
5432
5433           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5434                         bitsize, bitsize_zero_node);
5435           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5436           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5437           gimple_assign_set_lhs (epilog_stmt, new_temp);
5438           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5439           scalar_results.safe_push (new_temp);
5440         }
5441       else
5442         {
5443           /* Case 3: Create:
5444              s = extract_field <v_out2, 0>
5445              for (offset = element_size;
5446                   offset < vector_size;
5447                   offset += element_size;)
5448                {
5449                  Create:  s' = extract_field <v_out2, offset>
5450                  Create:  s = op <s, s'>  // For non SLP cases
5451                }  */
5452
5453           if (dump_enabled_p ())
5454             dump_printf_loc (MSG_NOTE, vect_location,
5455                              "Reduce using scalar code.\n");
5456
5457           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5458           int element_bitsize = tree_to_uhwi (bitsize);
5459           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5460             {
5461               int bit_offset;
5462               if (gimple_code (new_phi) == GIMPLE_PHI)
5463                 vec_temp = PHI_RESULT (new_phi);
5464               else
5465                 vec_temp = gimple_assign_lhs (new_phi);
5466               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5467                                  bitsize_zero_node);
5468               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5469               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5470               gimple_assign_set_lhs (epilog_stmt, new_temp);
5471               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5472
5473               /* In SLP we don't need to apply reduction operation, so we just
5474                  collect s' values in SCALAR_RESULTS.  */
5475               if (slp_reduc)
5476                 scalar_results.safe_push (new_temp);
5477
5478               for (bit_offset = element_bitsize;
5479                    bit_offset < vec_size_in_bits;
5480                    bit_offset += element_bitsize)
5481                 {
5482                   tree bitpos = bitsize_int (bit_offset);
5483                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5484                                      bitsize, bitpos);
5485
5486                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5487                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5488                   gimple_assign_set_lhs (epilog_stmt, new_name);
5489                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5490
5491                   if (slp_reduc)
5492                     {
5493                       /* In SLP we don't need to apply reduction operation, so
5494                          we just collect s' values in SCALAR_RESULTS.  */
5495                       new_temp = new_name;
5496                       scalar_results.safe_push (new_name);
5497                     }
5498                   else
5499                     {
5500                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5501                                                          new_name, new_temp);
5502                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5503                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5504                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5505                     }
5506                 }
5507             }
5508
5509           /* The only case where we need to reduce scalar results in SLP, is
5510              unrolling.  If the size of SCALAR_RESULTS is greater than
5511              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5512              REDUC_GROUP_SIZE.  */
5513           if (slp_reduc)
5514             {
5515               tree res, first_res, new_res;
5516               gimple *new_stmt;
5517
5518               /* Reduce multiple scalar results in case of SLP unrolling.  */
5519               for (j = group_size; scalar_results.iterate (j, &res);
5520                    j++)
5521                 {
5522                   first_res = scalar_results[j % group_size];
5523                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5524                                                   first_res, res);
5525                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5526                   gimple_assign_set_lhs (new_stmt, new_res);
5527                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5528                   scalar_results[j % group_size] = new_res;
5529                 }
5530             }
5531           else
5532             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5533             scalar_results.safe_push (new_temp);
5534         }
5535
5536       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5537            == INTEGER_INDUC_COND_REDUCTION)
5538           && !operand_equal_p (initial_def, induc_val, 0))
5539         {
5540           /* Earlier we set the initial value to be a vector if induc_val
5541              values.  Check the result and if it is induc_val then replace
5542              with the original initial value, unless induc_val is
5543              the same as initial_def already.  */
5544           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5545                                   induc_val);
5546
5547           tree tmp = make_ssa_name (new_scalar_dest);
5548           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5549                                              initial_def, new_temp);
5550           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5551           scalar_results[0] = tmp;
5552         }
5553     }
5554
5555 vect_finalize_reduction:
5556
5557   if (double_reduc)
5558     loop = loop->inner;
5559
5560   /* 2.5 Adjust the final result by the initial value of the reduction
5561          variable. (When such adjustment is not needed, then
5562          'adjustment_def' is zero).  For example, if code is PLUS we create:
5563          new_temp = loop_exit_def + adjustment_def  */
5564
5565   if (adjustment_def)
5566     {
5567       gcc_assert (!slp_reduc);
5568       if (nested_in_vect_loop)
5569         {
5570           new_phi = new_phis[0];
5571           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5572           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5573           new_dest = vect_create_destination_var (scalar_dest, vectype);
5574         }
5575       else
5576         {
5577           new_temp = scalar_results[0];
5578           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5579           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5580           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5581         }
5582
5583       epilog_stmt = gimple_build_assign (new_dest, expr);
5584       new_temp = make_ssa_name (new_dest, epilog_stmt);
5585       gimple_assign_set_lhs (epilog_stmt, new_temp);
5586       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5587       if (nested_in_vect_loop)
5588         {
5589           set_vinfo_for_stmt (epilog_stmt,
5590                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5591           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5592                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5593
5594           if (!double_reduc)
5595             scalar_results.quick_push (new_temp);
5596           else
5597             scalar_results[0] = new_temp;
5598         }
5599       else
5600         scalar_results[0] = new_temp;
5601
5602       new_phis[0] = epilog_stmt;
5603     }
5604
5605   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5606           phis with new adjusted scalar results, i.e., replace use <s_out0>
5607           with use <s_out4>.
5608
5609      Transform:
5610         loop_exit:
5611           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5612           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5613           v_out2 = reduce <v_out1>
5614           s_out3 = extract_field <v_out2, 0>
5615           s_out4 = adjust_result <s_out3>
5616           use <s_out0>
5617           use <s_out0>
5618
5619      into:
5620
5621         loop_exit:
5622           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5623           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5624           v_out2 = reduce <v_out1>
5625           s_out3 = extract_field <v_out2, 0>
5626           s_out4 = adjust_result <s_out3>
5627           use <s_out4>
5628           use <s_out4> */
5629
5630
5631   /* In SLP reduction chain we reduce vector results into one vector if
5632      necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
5633      LHS of the last stmt in the reduction chain, since we are looking for
5634      the loop exit phi node.  */
5635   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5636     {
5637       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5638       /* Handle reduction patterns.  */
5639       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5640         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5641
5642       scalar_dest = gimple_assign_lhs (dest_stmt);
5643       group_size = 1;
5644     }
5645
5646   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5647      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5648      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5649      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5650      correspond to the first vector stmt, etc.
5651      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5652   if (group_size > new_phis.length ())
5653     {
5654       ratio = group_size / new_phis.length ();
5655       gcc_assert (!(group_size % new_phis.length ()));
5656     }
5657   else
5658     ratio = 1;
5659
5660   for (k = 0; k < group_size; k++)
5661     {
5662       if (k % ratio == 0)
5663         {
5664           epilog_stmt = new_phis[k / ratio];
5665           reduction_phi = reduction_phis[k / ratio];
5666           if (double_reduc)
5667             inner_phi = inner_phis[k / ratio];
5668         }
5669
5670       if (slp_reduc)
5671         {
5672           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5673
5674           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5675           /* SLP statements can't participate in patterns.  */
5676           gcc_assert (!orig_stmt);
5677           scalar_dest = gimple_assign_lhs (current_stmt);
5678         }
5679
5680       phis.create (3);
5681       /* Find the loop-closed-use at the loop exit of the original scalar
5682          result.  (The reduction result is expected to have two immediate uses -
5683          one at the latch block, and one at the loop exit).  */
5684       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5685         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5686             && !is_gimple_debug (USE_STMT (use_p)))
5687           phis.safe_push (USE_STMT (use_p));
5688
5689       /* While we expect to have found an exit_phi because of loop-closed-ssa
5690          form we can end up without one if the scalar cycle is dead.  */
5691
5692       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5693         {
5694           if (outer_loop)
5695             {
5696               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5697               gphi *vect_phi;
5698
5699               /* FORNOW. Currently not supporting the case that an inner-loop
5700                  reduction is not used in the outer-loop (but only outside the
5701                  outer-loop), unless it is double reduction.  */
5702               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5703                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5704                           || double_reduc);
5705
5706               if (double_reduc)
5707                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5708               else
5709                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5710               if (!double_reduc
5711                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5712                       != vect_double_reduction_def)
5713                 continue;
5714
5715               /* Handle double reduction:
5716
5717                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5718                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5719                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5720                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5721
5722                  At that point the regular reduction (stmt2 and stmt3) is
5723                  already vectorized, as well as the exit phi node, stmt4.
5724                  Here we vectorize the phi node of double reduction, stmt1, and
5725                  update all relevant statements.  */
5726
5727               /* Go through all the uses of s2 to find double reduction phi
5728                  node, i.e., stmt1 above.  */
5729               orig_name = PHI_RESULT (exit_phi);
5730               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5731                 {
5732                   stmt_vec_info use_stmt_vinfo;
5733                   stmt_vec_info new_phi_vinfo;
5734                   tree vect_phi_init, preheader_arg, vect_phi_res;
5735                   basic_block bb = gimple_bb (use_stmt);
5736                   gimple *use;
5737
5738                   /* Check that USE_STMT is really double reduction phi
5739                      node.  */
5740                   if (gimple_code (use_stmt) != GIMPLE_PHI
5741                       || gimple_phi_num_args (use_stmt) != 2
5742                       || bb->loop_father != outer_loop)
5743                     continue;
5744                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5745                   if (!use_stmt_vinfo
5746                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5747                           != vect_double_reduction_def)
5748                     continue;
5749
5750                   /* Create vector phi node for double reduction:
5751                      vs1 = phi <vs0, vs2>
5752                      vs1 was created previously in this function by a call to
5753                        vect_get_vec_def_for_operand and is stored in
5754                        vec_initial_def;
5755                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5756                      vs0 is created here.  */
5757
5758                   /* Create vector phi node.  */
5759                   vect_phi = create_phi_node (vec_initial_def, bb);
5760                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5761                                     loop_vec_info_for_loop (outer_loop));
5762                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5763
5764                   /* Create vs0 - initial def of the double reduction phi.  */
5765                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5766                                              loop_preheader_edge (outer_loop));
5767                   vect_phi_init = get_initial_def_for_reduction
5768                     (stmt, preheader_arg, NULL);
5769
5770                   /* Update phi node arguments with vs0 and vs2.  */
5771                   add_phi_arg (vect_phi, vect_phi_init,
5772                                loop_preheader_edge (outer_loop),
5773                                UNKNOWN_LOCATION);
5774                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5775                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5776                   if (dump_enabled_p ())
5777                     {
5778                       dump_printf_loc (MSG_NOTE, vect_location,
5779                                        "created double reduction phi node: ");
5780                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5781                     }
5782
5783                   vect_phi_res = PHI_RESULT (vect_phi);
5784
5785                   /* Replace the use, i.e., set the correct vs1 in the regular
5786                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5787                      loop is redundant.  */
5788                   use = reduction_phi;
5789                   for (j = 0; j < ncopies; j++)
5790                     {
5791                       edge pr_edge = loop_preheader_edge (loop);
5792                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5793                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5794                     }
5795                 }
5796             }
5797         }
5798
5799       phis.release ();
5800       if (nested_in_vect_loop)
5801         {
5802           if (double_reduc)
5803             loop = outer_loop;
5804           else
5805             continue;
5806         }
5807
5808       phis.create (3);
5809       /* Find the loop-closed-use at the loop exit of the original scalar
5810          result.  (The reduction result is expected to have two immediate uses,
5811          one at the latch block, and one at the loop exit).  For double
5812          reductions we are looking for exit phis of the outer loop.  */
5813       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5814         {
5815           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5816             {
5817               if (!is_gimple_debug (USE_STMT (use_p)))
5818                 phis.safe_push (USE_STMT (use_p));
5819             }
5820           else
5821             {
5822               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5823                 {
5824                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5825
5826                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5827                     {
5828                       if (!flow_bb_inside_loop_p (loop,
5829                                              gimple_bb (USE_STMT (phi_use_p)))
5830                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5831                         phis.safe_push (USE_STMT (phi_use_p));
5832                     }
5833                 }
5834             }
5835         }
5836
5837       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5838         {
5839           /* Replace the uses:  */
5840           orig_name = PHI_RESULT (exit_phi);
5841           scalar_result = scalar_results[k];
5842           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5843             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5844               SET_USE (use_p, scalar_result);
5845         }
5846
5847       phis.release ();
5848     }
5849 }
5850
5851 /* Return a vector of type VECTYPE that is equal to the vector select
5852    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5853    before GSI.  */
5854
5855 static tree
5856 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5857                      tree vec, tree identity)
5858 {
5859   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5860   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5861                                           mask, vec, identity);
5862   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5863   return cond;
5864 }
5865
5866 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5867    order, starting with LHS.  Insert the extraction statements before GSI and
5868    associate the new scalar SSA names with variable SCALAR_DEST.
5869    Return the SSA name for the result.  */
5870
5871 static tree
5872 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5873                        tree_code code, tree lhs, tree vector_rhs)
5874 {
5875   tree vectype = TREE_TYPE (vector_rhs);
5876   tree scalar_type = TREE_TYPE (vectype);
5877   tree bitsize = TYPE_SIZE (scalar_type);
5878   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5879   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5880
5881   for (unsigned HOST_WIDE_INT bit_offset = 0;
5882        bit_offset < vec_size_in_bits;
5883        bit_offset += element_bitsize)
5884     {
5885       tree bitpos = bitsize_int (bit_offset);
5886       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5887                          bitsize, bitpos);
5888
5889       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5890       rhs = make_ssa_name (scalar_dest, stmt);
5891       gimple_assign_set_lhs (stmt, rhs);
5892       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5893
5894       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5895       tree new_name = make_ssa_name (scalar_dest, stmt);
5896       gimple_assign_set_lhs (stmt, new_name);
5897       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5898       lhs = new_name;
5899     }
5900   return lhs;
5901 }
5902
5903 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
5904    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5905    statement.  CODE is the operation performed by STMT and OPS are
5906    its scalar operands.  REDUC_INDEX is the index of the operand in
5907    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5908    implements in-order reduction, or IFN_LAST if we should open-code it.
5909    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5910    that should be used to control the operation in a fully-masked loop.  */
5911
5912 static bool
5913 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5914                                gimple **vec_stmt, slp_tree slp_node,
5915                                gimple *reduc_def_stmt,
5916                                tree_code code, internal_fn reduc_fn,
5917                                tree ops[3], tree vectype_in,
5918                                int reduc_index, vec_loop_masks *masks)
5919 {
5920   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5921   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5922   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5923   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5924   gimple *new_stmt = NULL;
5925
5926   int ncopies;
5927   if (slp_node)
5928     ncopies = 1;
5929   else
5930     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5931
5932   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5933   gcc_assert (ncopies == 1);
5934   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5935   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5936   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5937               == FOLD_LEFT_REDUCTION);
5938
5939   if (slp_node)
5940     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5941                           TYPE_VECTOR_SUBPARTS (vectype_in)));
5942
5943   tree op0 = ops[1 - reduc_index];
5944
5945   int group_size = 1;
5946   gimple *scalar_dest_def;
5947   auto_vec<tree> vec_oprnds0;
5948   if (slp_node)
5949     {
5950       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5951       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5952       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5953     }
5954   else
5955     {
5956       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5957       vec_oprnds0.create (1);
5958       vec_oprnds0.quick_push (loop_vec_def0);
5959       scalar_dest_def = stmt;
5960     }
5961
5962   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5963   tree scalar_type = TREE_TYPE (scalar_dest);
5964   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5965
5966   int vec_num = vec_oprnds0.length ();
5967   gcc_assert (vec_num == 1 || slp_node);
5968   tree vec_elem_type = TREE_TYPE (vectype_out);
5969   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5970
5971   tree vector_identity = NULL_TREE;
5972   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5973     vector_identity = build_zero_cst (vectype_out);
5974
5975   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5976   int i;
5977   tree def0;
5978   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5979     {
5980       tree mask = NULL_TREE;
5981       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5982         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5983
5984       /* Handle MINUS by adding the negative.  */
5985       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5986         {
5987           tree negated = make_ssa_name (vectype_out);
5988           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5989           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5990           def0 = negated;
5991         }
5992
5993       if (mask)
5994         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5995                                     vector_identity);
5996
5997       /* On the first iteration the input is simply the scalar phi
5998          result, and for subsequent iterations it is the output of
5999          the preceding operation.  */
6000       if (reduc_fn != IFN_LAST)
6001         {
6002           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6003           /* For chained SLP reductions the output of the previous reduction
6004              operation serves as the input of the next. For the final statement
6005              the output cannot be a temporary - we reuse the original
6006              scalar destination of the last statement.  */
6007           if (i != vec_num - 1)
6008             {
6009               gimple_set_lhs (new_stmt, scalar_dest_var);
6010               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6011               gimple_set_lhs (new_stmt, reduc_var);
6012             }
6013         }
6014       else
6015         {
6016           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6017                                              reduc_var, def0);
6018           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6019           /* Remove the statement, so that we can use the same code paths
6020              as for statements that we've just created.  */
6021           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6022           gsi_remove (&tmp_gsi, false);
6023         }
6024
6025       if (i == vec_num - 1)
6026         {
6027           gimple_set_lhs (new_stmt, scalar_dest);
6028           vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6029         }
6030       else
6031         vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6032
6033       if (slp_node)
6034         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6035     }
6036
6037   if (!slp_node)
6038     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6039
6040   return true;
6041 }
6042
6043 /* Function is_nonwrapping_integer_induction.
6044
6045    Check if STMT (which is part of loop LOOP) both increments and
6046    does not cause overflow.  */
6047
6048 static bool
6049 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6050 {
6051   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6052   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6053   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6054   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6055   widest_int ni, max_loop_value, lhs_max;
6056   bool overflow = false;
6057
6058   /* Make sure the loop is integer based.  */
6059   if (TREE_CODE (base) != INTEGER_CST
6060       || TREE_CODE (step) != INTEGER_CST)
6061     return false;
6062
6063   /* Check that the max size of the loop will not wrap.  */
6064
6065   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6066     return true;
6067
6068   if (! max_stmt_executions (loop, &ni))
6069     return false;
6070
6071   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6072                             &overflow);
6073   if (overflow)
6074     return false;
6075
6076   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6077                             TYPE_SIGN (lhs_type), &overflow);
6078   if (overflow)
6079     return false;
6080
6081   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6082           <= TYPE_PRECISION (lhs_type));
6083 }
6084
6085 /* Function vectorizable_reduction.
6086
6087    Check if STMT performs a reduction operation that can be vectorized.
6088    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6089    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6090    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6091
6092    This function also handles reduction idioms (patterns) that have been
6093    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6094    of this form:
6095      X = pattern_expr (arg0, arg1, ..., X)
6096    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
6097    sequence that had been detected and replaced by the pattern-stmt (STMT).
6098
6099    This function also handles reduction of condition expressions, for example:
6100      for (int i = 0; i < N; i++)
6101        if (a[i] < value)
6102          last = a[i];
6103    This is handled by vectorising the loop and creating an additional vector
6104    containing the loop indexes for which "a[i] < value" was true.  In the
6105    function epilogue this is reduced to a single max value and then used to
6106    index into the vector of results.
6107
6108    In some cases of reduction patterns, the type of the reduction variable X is
6109    different than the type of the other arguments of STMT.
6110    In such cases, the vectype that is used when transforming STMT into a vector
6111    stmt is different than the vectype that is used to determine the
6112    vectorization factor, because it consists of a different number of elements
6113    than the actual number of elements that are being operated upon in parallel.
6114
6115    For example, consider an accumulation of shorts into an int accumulator.
6116    On some targets it's possible to vectorize this pattern operating on 8
6117    shorts at a time (hence, the vectype for purposes of determining the
6118    vectorization factor should be V8HI); on the other hand, the vectype that
6119    is used to create the vector form is actually V4SI (the type of the result).
6120
6121    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6122    indicates what is the actual level of parallelism (V8HI in the example), so
6123    that the right vectorization factor would be derived.  This vectype
6124    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6125    be used to create the vectorized stmt.  The right vectype for the vectorized
6126    stmt is obtained from the type of the result X:
6127         get_vectype_for_scalar_type (TREE_TYPE (X))
6128
6129    This means that, contrary to "regular" reductions (or "regular" stmts in
6130    general), the following equation:
6131       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6132    does *NOT* necessarily hold for reduction patterns.  */
6133
6134 bool
6135 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6136                         gimple **vec_stmt, slp_tree slp_node,
6137                         slp_instance slp_node_instance,
6138                         stmt_vector_for_cost *cost_vec)
6139 {
6140   tree vec_dest;
6141   tree scalar_dest;
6142   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6143   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6144   tree vectype_in = NULL_TREE;
6145   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6146   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6147   enum tree_code code, orig_code;
6148   internal_fn reduc_fn;
6149   machine_mode vec_mode;
6150   int op_type;
6151   optab optab;
6152   tree new_temp = NULL_TREE;
6153   gimple *def_stmt;
6154   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6155   gimple *cond_reduc_def_stmt = NULL;
6156   enum tree_code cond_reduc_op_code = ERROR_MARK;
6157   tree scalar_type;
6158   bool is_simple_use;
6159   gimple *orig_stmt;
6160   stmt_vec_info orig_stmt_info = NULL;
6161   int i;
6162   int ncopies;
6163   int epilog_copies;
6164   stmt_vec_info prev_stmt_info, prev_phi_info;
6165   bool single_defuse_cycle = false;
6166   gimple *new_stmt = NULL;
6167   int j;
6168   tree ops[3];
6169   enum vect_def_type dts[3];
6170   bool nested_cycle = false, found_nested_cycle_def = false;
6171   bool double_reduc = false;
6172   basic_block def_bb;
6173   struct loop * def_stmt_loop, *outer_loop = NULL;
6174   tree def_arg;
6175   gimple *def_arg_stmt;
6176   auto_vec<tree> vec_oprnds0;
6177   auto_vec<tree> vec_oprnds1;
6178   auto_vec<tree> vec_oprnds2;
6179   auto_vec<tree> vect_defs;
6180   auto_vec<gimple *> phis;
6181   int vec_num;
6182   tree def0, tem;
6183   bool first_p = true;
6184   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6185   tree cond_reduc_val = NULL_TREE;
6186
6187   /* Make sure it was already recognized as a reduction computation.  */
6188   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6189       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6190     return false;
6191
6192   if (nested_in_vect_loop_p (loop, stmt))
6193     {
6194       outer_loop = loop;
6195       loop = loop->inner;
6196       nested_cycle = true;
6197     }
6198
6199   /* In case of reduction chain we switch to the first stmt in the chain, but
6200      we don't update STMT_INFO, since only the last stmt is marked as reduction
6201      and has reduction properties.  */
6202   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6203       && REDUC_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6204     {
6205       stmt = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6206       first_p = false;
6207     }
6208
6209   if (gimple_code (stmt) == GIMPLE_PHI)
6210     {
6211       /* Analysis is fully done on the reduction stmt invocation.  */
6212       if (! vec_stmt)
6213         {
6214           if (slp_node)
6215             slp_node_instance->reduc_phis = slp_node;
6216
6217           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6218           return true;
6219         }
6220
6221       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6222         /* Leave the scalar phi in place.  Note that checking
6223            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6224            for reductions involving a single statement.  */
6225         return true;
6226
6227       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6228       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6229         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6230
6231       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6232           == EXTRACT_LAST_REDUCTION)
6233         /* Leave the scalar phi in place.  */
6234         return true;
6235
6236       gcc_assert (is_gimple_assign (reduc_stmt));
6237       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6238         {
6239           tree op = gimple_op (reduc_stmt, k);
6240           if (op == gimple_phi_result (stmt))
6241             continue;
6242           if (k == 1
6243               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6244             continue;
6245           if (!vectype_in
6246               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6247                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6248             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6249           break;
6250         }
6251       gcc_assert (vectype_in);
6252
6253       if (slp_node)
6254         ncopies = 1;
6255       else
6256         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6257
6258       use_operand_p use_p;
6259       gimple *use_stmt;
6260       if (ncopies > 1
6261           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6262               <= vect_used_only_live)
6263           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6264           && (use_stmt == reduc_stmt
6265               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6266                   == reduc_stmt)))
6267         single_defuse_cycle = true;
6268
6269       /* Create the destination vector  */
6270       scalar_dest = gimple_assign_lhs (reduc_stmt);
6271       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6272
6273       if (slp_node)
6274         /* The size vect_schedule_slp_instance computes is off for us.  */
6275         vec_num = vect_get_num_vectors
6276           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6277            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6278            vectype_in);
6279       else
6280         vec_num = 1;
6281
6282       /* Generate the reduction PHIs upfront.  */
6283       prev_phi_info = NULL;
6284       for (j = 0; j < ncopies; j++)
6285         {
6286           if (j == 0 || !single_defuse_cycle)
6287             {
6288               for (i = 0; i < vec_num; i++)
6289                 {
6290                   /* Create the reduction-phi that defines the reduction
6291                      operand.  */
6292                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6293                   set_vinfo_for_stmt (new_phi,
6294                                       new_stmt_vec_info (new_phi, loop_vinfo));
6295
6296                   if (slp_node)
6297                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6298                   else
6299                     {
6300                       if (j == 0)
6301                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6302                       else
6303                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6304                       prev_phi_info = vinfo_for_stmt (new_phi);
6305                     }
6306                 }
6307             }
6308         }
6309
6310       return true;
6311     }
6312
6313   /* 1. Is vectorizable reduction?  */
6314   /* Not supportable if the reduction variable is used in the loop, unless
6315      it's a reduction chain.  */
6316   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6317       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6318     return false;
6319
6320   /* Reductions that are not used even in an enclosing outer-loop,
6321      are expected to be "live" (used out of the loop).  */
6322   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6323       && !STMT_VINFO_LIVE_P (stmt_info))
6324     return false;
6325
6326   /* 2. Has this been recognized as a reduction pattern?
6327
6328      Check if STMT represents a pattern that has been recognized
6329      in earlier analysis stages.  For stmts that represent a pattern,
6330      the STMT_VINFO_RELATED_STMT field records the last stmt in
6331      the original sequence that constitutes the pattern.  */
6332
6333   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6334   if (orig_stmt)
6335     {
6336       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6337       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6338       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6339     }
6340
6341   /* 3. Check the operands of the operation.  The first operands are defined
6342         inside the loop body. The last operand is the reduction variable,
6343         which is defined by the loop-header-phi.  */
6344
6345   gcc_assert (is_gimple_assign (stmt));
6346
6347   /* Flatten RHS.  */
6348   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6349     {
6350     case GIMPLE_BINARY_RHS:
6351       code = gimple_assign_rhs_code (stmt);
6352       op_type = TREE_CODE_LENGTH (code);
6353       gcc_assert (op_type == binary_op);
6354       ops[0] = gimple_assign_rhs1 (stmt);
6355       ops[1] = gimple_assign_rhs2 (stmt);
6356       break;
6357
6358     case GIMPLE_TERNARY_RHS:
6359       code = gimple_assign_rhs_code (stmt);
6360       op_type = TREE_CODE_LENGTH (code);
6361       gcc_assert (op_type == ternary_op);
6362       ops[0] = gimple_assign_rhs1 (stmt);
6363       ops[1] = gimple_assign_rhs2 (stmt);
6364       ops[2] = gimple_assign_rhs3 (stmt);
6365       break;
6366
6367     case GIMPLE_UNARY_RHS:
6368       return false;
6369
6370     default:
6371       gcc_unreachable ();
6372     }
6373
6374   if (code == COND_EXPR && slp_node)
6375     return false;
6376
6377   scalar_dest = gimple_assign_lhs (stmt);
6378   scalar_type = TREE_TYPE (scalar_dest);
6379   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6380       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6381     return false;
6382
6383   /* Do not try to vectorize bit-precision reductions.  */
6384   if (!type_has_mode_precision_p (scalar_type))
6385     return false;
6386
6387   /* All uses but the last are expected to be defined in the loop.
6388      The last use is the reduction variable.  In case of nested cycle this
6389      assumption is not true: we use reduc_index to record the index of the
6390      reduction variable.  */
6391   gimple *reduc_def_stmt = NULL;
6392   int reduc_index = -1;
6393   for (i = 0; i < op_type; i++)
6394     {
6395       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6396       if (i == 0 && code == COND_EXPR)
6397         continue;
6398
6399       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6400                                           &def_stmt, &dts[i], &tem);
6401       dt = dts[i];
6402       gcc_assert (is_simple_use);
6403       if (dt == vect_reduction_def)
6404         {
6405           reduc_def_stmt = def_stmt;
6406           reduc_index = i;
6407           continue;
6408         }
6409       else if (tem)
6410         {
6411           /* To properly compute ncopies we are interested in the widest
6412              input type in case we're looking at a widening accumulation.  */
6413           if (!vectype_in
6414               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6415                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6416             vectype_in = tem;
6417         }
6418
6419       if (dt != vect_internal_def
6420           && dt != vect_external_def
6421           && dt != vect_constant_def
6422           && dt != vect_induction_def
6423           && !(dt == vect_nested_cycle && nested_cycle))
6424         return false;
6425
6426       if (dt == vect_nested_cycle)
6427         {
6428           found_nested_cycle_def = true;
6429           reduc_def_stmt = def_stmt;
6430           reduc_index = i;
6431         }
6432
6433       if (i == 1 && code == COND_EXPR)
6434         {
6435           /* Record how value of COND_EXPR is defined.  */
6436           if (dt == vect_constant_def)
6437             {
6438               cond_reduc_dt = dt;
6439               cond_reduc_val = ops[i];
6440             }
6441           if (dt == vect_induction_def
6442               && def_stmt != NULL
6443               && is_nonwrapping_integer_induction (def_stmt, loop))
6444             {
6445               cond_reduc_dt = dt;
6446               cond_reduc_def_stmt = def_stmt;
6447             }
6448         }
6449     }
6450
6451   if (!vectype_in)
6452     vectype_in = vectype_out;
6453
6454   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6455      directy used in stmt.  */
6456   if (reduc_index == -1)
6457     {
6458       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6459         {
6460           if (dump_enabled_p ())
6461             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6462                              "in-order reduction chain without SLP.\n");
6463           return false;
6464         }
6465
6466       if (orig_stmt)
6467         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6468       else
6469         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6470     }
6471
6472   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6473     return false;
6474
6475   if (!(reduc_index == -1
6476         || dts[reduc_index] == vect_reduction_def
6477         || dts[reduc_index] == vect_nested_cycle
6478         || ((dts[reduc_index] == vect_internal_def
6479              || dts[reduc_index] == vect_external_def
6480              || dts[reduc_index] == vect_constant_def
6481              || dts[reduc_index] == vect_induction_def)
6482             && nested_cycle && found_nested_cycle_def)))
6483     {
6484       /* For pattern recognized stmts, orig_stmt might be a reduction,
6485          but some helper statements for the pattern might not, or
6486          might be COND_EXPRs with reduction uses in the condition.  */
6487       gcc_assert (orig_stmt);
6488       return false;
6489     }
6490
6491   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6492   enum vect_reduction_type v_reduc_type
6493     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6494   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6495
6496   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6497   /* If we have a condition reduction, see if we can simplify it further.  */
6498   if (v_reduc_type == COND_REDUCTION)
6499     {
6500       /* TODO: We can't yet handle reduction chains, since we need to treat
6501          each COND_EXPR in the chain specially, not just the last one.
6502          E.g. for:
6503
6504             x_1 = PHI <x_3, ...>
6505             x_2 = a_2 ? ... : x_1;
6506             x_3 = a_3 ? ... : x_2;
6507
6508          we're interested in the last element in x_3 for which a_2 || a_3
6509          is true, whereas the current reduction chain handling would
6510          vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6511          as a reduction operation.  */
6512       if (reduc_index == -1)
6513         {
6514           if (dump_enabled_p ())
6515             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6516                              "conditional reduction chains not supported\n");
6517           return false;
6518         }
6519
6520       /* vect_is_simple_reduction ensured that operand 2 is the
6521          loop-carried operand.  */
6522       gcc_assert (reduc_index == 2);
6523
6524       /* Loop peeling modifies initial value of reduction PHI, which
6525          makes the reduction stmt to be transformed different to the
6526          original stmt analyzed.  We need to record reduction code for
6527          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6528          it can be used directly at transform stage.  */
6529       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6530           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6531         {
6532           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6533           gcc_assert (cond_reduc_dt == vect_constant_def);
6534           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6535         }
6536       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6537                                                vectype_in, OPTIMIZE_FOR_SPEED))
6538         {
6539           if (dump_enabled_p ())
6540             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6541                              "optimizing condition reduction with"
6542                              " FOLD_EXTRACT_LAST.\n");
6543           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6544         }
6545       else if (cond_reduc_dt == vect_induction_def)
6546         {
6547           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6548           tree base
6549             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6550           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6551
6552           gcc_assert (TREE_CODE (base) == INTEGER_CST
6553                       && TREE_CODE (step) == INTEGER_CST);
6554           cond_reduc_val = NULL_TREE;
6555           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6556              above base; punt if base is the minimum value of the type for
6557              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6558           if (tree_int_cst_sgn (step) == -1)
6559             {
6560               cond_reduc_op_code = MIN_EXPR;
6561               if (tree_int_cst_sgn (base) == -1)
6562                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6563               else if (tree_int_cst_lt (base,
6564                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6565                 cond_reduc_val
6566                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6567             }
6568           else
6569             {
6570               cond_reduc_op_code = MAX_EXPR;
6571               if (tree_int_cst_sgn (base) == 1)
6572                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6573               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6574                                         base))
6575                 cond_reduc_val
6576                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6577             }
6578           if (cond_reduc_val)
6579             {
6580               if (dump_enabled_p ())
6581                 dump_printf_loc (MSG_NOTE, vect_location,
6582                                  "condition expression based on "
6583                                  "integer induction.\n");
6584               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6585                 = INTEGER_INDUC_COND_REDUCTION;
6586             }
6587         }
6588       else if (cond_reduc_dt == vect_constant_def)
6589         {
6590           enum vect_def_type cond_initial_dt;
6591           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6592           tree cond_initial_val
6593             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6594
6595           gcc_assert (cond_reduc_val != NULL_TREE);
6596           vect_is_simple_use (cond_initial_val, loop_vinfo,
6597                               &def_stmt, &cond_initial_dt);
6598           if (cond_initial_dt == vect_constant_def
6599               && types_compatible_p (TREE_TYPE (cond_initial_val),
6600                                      TREE_TYPE (cond_reduc_val)))
6601             {
6602               tree e = fold_binary (LE_EXPR, boolean_type_node,
6603                                     cond_initial_val, cond_reduc_val);
6604               if (e && (integer_onep (e) || integer_zerop (e)))
6605                 {
6606                   if (dump_enabled_p ())
6607                     dump_printf_loc (MSG_NOTE, vect_location,
6608                                      "condition expression based on "
6609                                      "compile time constant.\n");
6610                   /* Record reduction code at analysis stage.  */
6611                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6612                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6613                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6614                     = CONST_COND_REDUCTION;
6615                 }
6616             }
6617         }
6618     }
6619
6620   if (orig_stmt)
6621     gcc_assert (tmp == orig_stmt
6622                 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6623                     == orig_stmt));
6624   else
6625     /* We changed STMT to be the first stmt in reduction chain, hence we
6626        check that in this case the first element in the chain is STMT.  */
6627     gcc_assert (stmt == tmp
6628                 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6629
6630   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6631     return false;
6632
6633   if (slp_node)
6634     ncopies = 1;
6635   else
6636     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6637
6638   gcc_assert (ncopies >= 1);
6639
6640   vec_mode = TYPE_MODE (vectype_in);
6641   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6642
6643   if (code == COND_EXPR)
6644     {
6645       /* Only call during the analysis stage, otherwise we'll lose
6646          STMT_VINFO_TYPE.  */
6647       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6648                                                 ops[reduc_index], 0, NULL,
6649                                                 cost_vec))
6650         {
6651           if (dump_enabled_p ())
6652             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6653                              "unsupported condition in reduction\n");
6654           return false;
6655         }
6656     }
6657   else
6658     {
6659       /* 4. Supportable by target?  */
6660
6661       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6662           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6663         {
6664           /* Shifts and rotates are only supported by vectorizable_shifts,
6665              not vectorizable_reduction.  */
6666           if (dump_enabled_p ())
6667             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6668                              "unsupported shift or rotation.\n");
6669           return false;
6670         }
6671
6672       /* 4.1. check support for the operation in the loop  */
6673       optab = optab_for_tree_code (code, vectype_in, optab_default);
6674       if (!optab)
6675         {
6676           if (dump_enabled_p ())
6677             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6678                              "no optab.\n");
6679
6680           return false;
6681         }
6682
6683       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6684         {
6685           if (dump_enabled_p ())
6686             dump_printf (MSG_NOTE, "op not supported by target.\n");
6687
6688           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6689               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6690             return false;
6691
6692           if (dump_enabled_p ())
6693             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6694         }
6695
6696       /* Worthwhile without SIMD support?  */
6697       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6698           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6699         {
6700           if (dump_enabled_p ())
6701             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6702                              "not worthwhile without SIMD support.\n");
6703
6704           return false;
6705         }
6706     }
6707
6708   /* 4.2. Check support for the epilog operation.
6709
6710           If STMT represents a reduction pattern, then the type of the
6711           reduction variable may be different than the type of the rest
6712           of the arguments.  For example, consider the case of accumulation
6713           of shorts into an int accumulator; The original code:
6714                         S1: int_a = (int) short_a;
6715           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6716
6717           was replaced with:
6718                         STMT: int_acc = widen_sum <short_a, int_acc>
6719
6720           This means that:
6721           1. The tree-code that is used to create the vector operation in the
6722              epilog code (that reduces the partial results) is not the
6723              tree-code of STMT, but is rather the tree-code of the original
6724              stmt from the pattern that STMT is replacing.  I.e, in the example
6725              above we want to use 'widen_sum' in the loop, but 'plus' in the
6726              epilog.
6727           2. The type (mode) we use to check available target support
6728              for the vector operation to be created in the *epilog*, is
6729              determined by the type of the reduction variable (in the example
6730              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6731              However the type (mode) we use to check available target support
6732              for the vector operation to be created *inside the loop*, is
6733              determined by the type of the other arguments to STMT (in the
6734              example we'd check this: optab_handler (widen_sum_optab,
6735              vect_short_mode)).
6736
6737           This is contrary to "regular" reductions, in which the types of all
6738           the arguments are the same as the type of the reduction variable.
6739           For "regular" reductions we can therefore use the same vector type
6740           (and also the same tree-code) when generating the epilog code and
6741           when generating the code inside the loop.  */
6742
6743   vect_reduction_type reduction_type
6744     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6745   if (orig_stmt
6746       && (reduction_type == TREE_CODE_REDUCTION
6747           || reduction_type == FOLD_LEFT_REDUCTION))
6748     {
6749       /* This is a reduction pattern: get the vectype from the type of the
6750          reduction variable, and get the tree-code from orig_stmt.  */
6751       orig_code = gimple_assign_rhs_code (orig_stmt);
6752       gcc_assert (vectype_out);
6753       vec_mode = TYPE_MODE (vectype_out);
6754     }
6755   else
6756     {
6757       /* Regular reduction: use the same vectype and tree-code as used for
6758          the vector code inside the loop can be used for the epilog code. */
6759       orig_code = code;
6760
6761       if (code == MINUS_EXPR)
6762         orig_code = PLUS_EXPR;
6763
6764       /* For simple condition reductions, replace with the actual expression
6765          we want to base our reduction around.  */
6766       if (reduction_type == CONST_COND_REDUCTION)
6767         {
6768           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6769           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6770         }
6771       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6772         orig_code = cond_reduc_op_code;
6773     }
6774
6775   if (nested_cycle)
6776     {
6777       def_bb = gimple_bb (reduc_def_stmt);
6778       def_stmt_loop = def_bb->loop_father;
6779       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6780                                        loop_preheader_edge (def_stmt_loop));
6781       if (TREE_CODE (def_arg) == SSA_NAME
6782           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6783           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6784           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6785           && vinfo_for_stmt (def_arg_stmt)
6786           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6787               == vect_double_reduction_def)
6788         double_reduc = true;
6789     }
6790
6791   reduc_fn = IFN_LAST;
6792
6793   if (reduction_type == TREE_CODE_REDUCTION
6794       || reduction_type == FOLD_LEFT_REDUCTION
6795       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6796       || reduction_type == CONST_COND_REDUCTION)
6797     {
6798       if (reduction_type == FOLD_LEFT_REDUCTION
6799           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6800           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6801         {
6802           if (reduc_fn != IFN_LAST
6803               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6804                                                   OPTIMIZE_FOR_SPEED))
6805             {
6806               if (dump_enabled_p ())
6807                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6808                                  "reduc op not supported by target.\n");
6809
6810               reduc_fn = IFN_LAST;
6811             }
6812         }
6813       else
6814         {
6815           if (!nested_cycle || double_reduc)
6816             {
6817               if (dump_enabled_p ())
6818                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6819                                  "no reduc code for scalar code.\n");
6820
6821               return false;
6822             }
6823         }
6824     }
6825   else if (reduction_type == COND_REDUCTION)
6826     {
6827       int scalar_precision
6828         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6829       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6830       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6831                                                 nunits_out);
6832
6833       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6834                                           OPTIMIZE_FOR_SPEED))
6835         reduc_fn = IFN_REDUC_MAX;
6836     }
6837
6838   if (reduction_type != EXTRACT_LAST_REDUCTION
6839       && reduc_fn == IFN_LAST
6840       && !nunits_out.is_constant ())
6841     {
6842       if (dump_enabled_p ())
6843         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6844                          "missing target support for reduction on"
6845                          " variable-length vectors.\n");
6846       return false;
6847     }
6848
6849   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6850       && ncopies > 1)
6851     {
6852       if (dump_enabled_p ())
6853         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6854                          "multiple types in double reduction or condition "
6855                          "reduction.\n");
6856       return false;
6857     }
6858
6859   /* For SLP reductions, see if there is a neutral value we can use.  */
6860   tree neutral_op = NULL_TREE;
6861   if (slp_node)
6862     neutral_op = neutral_op_for_slp_reduction
6863                    (slp_node_instance->reduc_phis, code,
6864                     REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6865
6866   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6867     {
6868       /* We can't support in-order reductions of code such as this:
6869
6870            for (int i = 0; i < n1; ++i)
6871              for (int j = 0; j < n2; ++j)
6872                l += a[j];
6873
6874          since GCC effectively transforms the loop when vectorizing:
6875
6876            for (int i = 0; i < n1 / VF; ++i)
6877              for (int j = 0; j < n2; ++j)
6878                for (int k = 0; k < VF; ++k)
6879                  l += a[j];
6880
6881          which is a reassociation of the original operation.  */
6882       if (dump_enabled_p ())
6883         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6884                          "in-order double reduction not supported.\n");
6885
6886       return false;
6887     }
6888
6889   if (reduction_type == FOLD_LEFT_REDUCTION
6890       && slp_node
6891       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6892     {
6893       /* We cannot use in-order reductions in this case because there is
6894          an implicit reassociation of the operations involved.  */
6895       if (dump_enabled_p ())
6896         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6897                          "in-order unchained SLP reductions not supported.\n");
6898       return false;
6899     }
6900
6901   /* For double reductions, and for SLP reductions with a neutral value,
6902      we construct a variable-length initial vector by loading a vector
6903      full of the neutral value and then shift-and-inserting the start
6904      values into the low-numbered elements.  */
6905   if ((double_reduc || neutral_op)
6906       && !nunits_out.is_constant ()
6907       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6908                                           vectype_out, OPTIMIZE_FOR_SPEED))
6909     {
6910       if (dump_enabled_p ())
6911         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6912                          "reduction on variable-length vectors requires"
6913                          " target support for a vector-shift-and-insert"
6914                          " operation.\n");
6915       return false;
6916     }
6917
6918   /* Check extra constraints for variable-length unchained SLP reductions.  */
6919   if (STMT_SLP_TYPE (stmt_info)
6920       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6921       && !nunits_out.is_constant ())
6922     {
6923       /* We checked above that we could build the initial vector when
6924          there's a neutral element value.  Check here for the case in
6925          which each SLP statement has its own initial value and in which
6926          that value needs to be repeated for every instance of the
6927          statement within the initial vector.  */
6928       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6929       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6930       if (!neutral_op
6931           && !can_duplicate_and_interleave_p (group_size, elt_mode))
6932         {
6933           if (dump_enabled_p ())
6934             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6935                              "unsupported form of SLP reduction for"
6936                              " variable-length vectors: cannot build"
6937                              " initial vector.\n");
6938           return false;
6939         }
6940       /* The epilogue code relies on the number of elements being a multiple
6941          of the group size.  The duplicate-and-interleave approach to setting
6942          up the the initial vector does too.  */
6943       if (!multiple_p (nunits_out, group_size))
6944         {
6945           if (dump_enabled_p ())
6946             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6947                              "unsupported form of SLP reduction for"
6948                              " variable-length vectors: the vector size"
6949                              " is not a multiple of the number of results.\n");
6950           return false;
6951         }
6952     }
6953
6954   /* In case of widenning multiplication by a constant, we update the type
6955      of the constant to be the type of the other operand.  We check that the
6956      constant fits the type in the pattern recognition pass.  */
6957   if (code == DOT_PROD_EXPR
6958       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6959     {
6960       if (TREE_CODE (ops[0]) == INTEGER_CST)
6961         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6962       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6963         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6964       else
6965         {
6966           if (dump_enabled_p ())
6967             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6968                              "invalid types in dot-prod\n");
6969
6970           return false;
6971         }
6972     }
6973
6974   if (reduction_type == COND_REDUCTION)
6975     {
6976       widest_int ni;
6977
6978       if (! max_loop_iterations (loop, &ni))
6979         {
6980           if (dump_enabled_p ())
6981             dump_printf_loc (MSG_NOTE, vect_location,
6982                              "loop count not known, cannot create cond "
6983                              "reduction.\n");
6984           return false;
6985         }
6986       /* Convert backedges to iterations.  */
6987       ni += 1;
6988
6989       /* The additional index will be the same type as the condition.  Check
6990          that the loop can fit into this less one (because we'll use up the
6991          zero slot for when there are no matches).  */
6992       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6993       if (wi::geu_p (ni, wi::to_widest (max_index)))
6994         {
6995           if (dump_enabled_p ())
6996             dump_printf_loc (MSG_NOTE, vect_location,
6997                              "loop size is greater than data size.\n");
6998           return false;
6999         }
7000     }
7001
7002   /* In case the vectorization factor (VF) is bigger than the number
7003      of elements that we can fit in a vectype (nunits), we have to generate
7004      more than one vector stmt - i.e - we need to "unroll" the
7005      vector stmt by a factor VF/nunits.  For more details see documentation
7006      in vectorizable_operation.  */
7007
7008   /* If the reduction is used in an outer loop we need to generate
7009      VF intermediate results, like so (e.g. for ncopies=2):
7010         r0 = phi (init, r0)
7011         r1 = phi (init, r1)
7012         r0 = x0 + r0;
7013         r1 = x1 + r1;
7014     (i.e. we generate VF results in 2 registers).
7015     In this case we have a separate def-use cycle for each copy, and therefore
7016     for each copy we get the vector def for the reduction variable from the
7017     respective phi node created for this copy.
7018
7019     Otherwise (the reduction is unused in the loop nest), we can combine
7020     together intermediate results, like so (e.g. for ncopies=2):
7021         r = phi (init, r)
7022         r = x0 + r;
7023         r = x1 + r;
7024    (i.e. we generate VF/2 results in a single register).
7025    In this case for each copy we get the vector def for the reduction variable
7026    from the vectorized reduction operation generated in the previous iteration.
7027
7028    This only works when we see both the reduction PHI and its only consumer
7029    in vectorizable_reduction and there are no intermediate stmts
7030    participating.  */
7031   use_operand_p use_p;
7032   gimple *use_stmt;
7033   if (ncopies > 1
7034       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7035       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7036       && (use_stmt == stmt
7037           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7038     {
7039       single_defuse_cycle = true;
7040       epilog_copies = 1;
7041     }
7042   else
7043     epilog_copies = ncopies;
7044
7045   /* If the reduction stmt is one of the patterns that have lane
7046      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7047   if ((ncopies > 1
7048        && ! single_defuse_cycle)
7049       && (code == DOT_PROD_EXPR
7050           || code == WIDEN_SUM_EXPR
7051           || code == SAD_EXPR))
7052     {
7053       if (dump_enabled_p ())
7054         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7055                          "multi def-use cycle not possible for lane-reducing "
7056                          "reduction operation\n");
7057       return false;
7058     }
7059
7060   if (slp_node)
7061     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7062   else
7063     vec_num = 1;
7064
7065   internal_fn cond_fn = get_conditional_internal_fn (code);
7066   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7067
7068   if (!vec_stmt) /* transformation not required.  */
7069     {
7070       if (first_p)
7071         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7072       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7073         {
7074           if (reduction_type != FOLD_LEFT_REDUCTION
7075               && (cond_fn == IFN_LAST
7076                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7077                                                       OPTIMIZE_FOR_SPEED)))
7078             {
7079               if (dump_enabled_p ())
7080                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7081                                  "can't use a fully-masked loop because no"
7082                                  " conditional operation is available.\n");
7083               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7084             }
7085           else if (reduc_index == -1)
7086             {
7087               if (dump_enabled_p ())
7088                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7089                                  "can't use a fully-masked loop for chained"
7090                                  " reductions.\n");
7091               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7092             }
7093           else
7094             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7095                                    vectype_in);
7096         }
7097       if (dump_enabled_p ()
7098           && reduction_type == FOLD_LEFT_REDUCTION)
7099         dump_printf_loc (MSG_NOTE, vect_location,
7100                          "using an in-order (fold-left) reduction.\n");
7101       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7102       return true;
7103     }
7104
7105   /* Transform.  */
7106
7107   if (dump_enabled_p ())
7108     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7109
7110   /* FORNOW: Multiple types are not supported for condition.  */
7111   if (code == COND_EXPR)
7112     gcc_assert (ncopies == 1);
7113
7114   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7115
7116   if (reduction_type == FOLD_LEFT_REDUCTION)
7117     return vectorize_fold_left_reduction
7118       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7119        reduc_fn, ops, vectype_in, reduc_index, masks);
7120
7121   if (reduction_type == EXTRACT_LAST_REDUCTION)
7122     {
7123       gcc_assert (!slp_node);
7124       return vectorizable_condition (stmt, gsi, vec_stmt,
7125                                      NULL, reduc_index, NULL, NULL);
7126     }
7127
7128   /* Create the destination vector  */
7129   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7130
7131   prev_stmt_info = NULL;
7132   prev_phi_info = NULL;
7133   if (!slp_node)
7134     {
7135       vec_oprnds0.create (1);
7136       vec_oprnds1.create (1);
7137       if (op_type == ternary_op)
7138         vec_oprnds2.create (1);
7139     }
7140
7141   phis.create (vec_num);
7142   vect_defs.create (vec_num);
7143   if (!slp_node)
7144     vect_defs.quick_push (NULL_TREE);
7145
7146   if (slp_node)
7147     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7148   else
7149     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7150
7151   for (j = 0; j < ncopies; j++)
7152     {
7153       if (code == COND_EXPR)
7154         {
7155           gcc_assert (!slp_node);
7156           vectorizable_condition (stmt, gsi, vec_stmt,
7157                                   PHI_RESULT (phis[0]),
7158                                   reduc_index, NULL, NULL);
7159           /* Multiple types are not supported for condition.  */
7160           break;
7161         }
7162
7163       /* Handle uses.  */
7164       if (j == 0)
7165         {
7166           if (slp_node)
7167             {
7168               /* Get vec defs for all the operands except the reduction index,
7169                  ensuring the ordering of the ops in the vector is kept.  */
7170               auto_vec<tree, 3> slp_ops;
7171               auto_vec<vec<tree>, 3> vec_defs;
7172
7173               slp_ops.quick_push (ops[0]);
7174               slp_ops.quick_push (ops[1]);
7175               if (op_type == ternary_op)
7176                 slp_ops.quick_push (ops[2]);
7177
7178               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7179
7180               vec_oprnds0.safe_splice (vec_defs[0]);
7181               vec_defs[0].release ();
7182               vec_oprnds1.safe_splice (vec_defs[1]);
7183               vec_defs[1].release ();
7184               if (op_type == ternary_op)
7185                 {
7186                   vec_oprnds2.safe_splice (vec_defs[2]);
7187                   vec_defs[2].release ();
7188                 }
7189             }
7190           else
7191             {
7192               vec_oprnds0.quick_push
7193                 (vect_get_vec_def_for_operand (ops[0], stmt));
7194               vec_oprnds1.quick_push
7195                 (vect_get_vec_def_for_operand (ops[1], stmt));
7196               if (op_type == ternary_op)
7197                 vec_oprnds2.quick_push
7198                   (vect_get_vec_def_for_operand (ops[2], stmt));
7199             }
7200         }
7201       else
7202         {
7203           if (!slp_node)
7204             {
7205               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7206
7207               if (single_defuse_cycle && reduc_index == 0)
7208                 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7209               else
7210                 vec_oprnds0[0]
7211                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7212               if (single_defuse_cycle && reduc_index == 1)
7213                 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7214               else
7215                 vec_oprnds1[0]
7216                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7217               if (op_type == ternary_op)
7218                 {
7219                   if (single_defuse_cycle && reduc_index == 2)
7220                     vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7221                   else
7222                     vec_oprnds2[0]
7223                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7224                 }
7225             }
7226         }
7227
7228       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7229         {
7230           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7231           if (masked_loop_p)
7232             {
7233               /* Make sure that the reduction accumulator is vop[0].  */
7234               if (reduc_index == 1)
7235                 {
7236                   gcc_assert (commutative_tree_code (code));
7237                   std::swap (vop[0], vop[1]);
7238                 }
7239               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7240                                               vectype_in, i * ncopies + j);
7241               gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7242                                                         vop[0], vop[1],
7243                                                         vop[0]);
7244               new_temp = make_ssa_name (vec_dest, call);
7245               gimple_call_set_lhs (call, new_temp);
7246               gimple_call_set_nothrow (call, true);
7247               new_stmt = call;
7248             }
7249           else
7250             {
7251               if (op_type == ternary_op)
7252                 vop[2] = vec_oprnds2[i];
7253
7254               new_temp = make_ssa_name (vec_dest, new_stmt);
7255               new_stmt = gimple_build_assign (new_temp, code,
7256                                               vop[0], vop[1], vop[2]);
7257             }
7258           vect_finish_stmt_generation (stmt, new_stmt, gsi);
7259
7260           if (slp_node)
7261             {
7262               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7263               vect_defs.quick_push (new_temp);
7264             }
7265           else
7266             vect_defs[0] = new_temp;
7267         }
7268
7269       if (slp_node)
7270         continue;
7271
7272       if (j == 0)
7273         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7274       else
7275         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7276
7277       prev_stmt_info = vinfo_for_stmt (new_stmt);
7278     }
7279
7280   /* Finalize the reduction-phi (set its arguments) and create the
7281      epilog reduction code.  */
7282   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7283     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7284
7285   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7286                                     epilog_copies, reduc_fn, phis,
7287                                     double_reduc, slp_node, slp_node_instance,
7288                                     cond_reduc_val, cond_reduc_op_code,
7289                                     neutral_op);
7290
7291   return true;
7292 }
7293
7294 /* Function vect_min_worthwhile_factor.
7295
7296    For a loop where we could vectorize the operation indicated by CODE,
7297    return the minimum vectorization factor that makes it worthwhile
7298    to use generic vectors.  */
7299 static unsigned int
7300 vect_min_worthwhile_factor (enum tree_code code)
7301 {
7302   switch (code)
7303     {
7304     case PLUS_EXPR:
7305     case MINUS_EXPR:
7306     case NEGATE_EXPR:
7307       return 4;
7308
7309     case BIT_AND_EXPR:
7310     case BIT_IOR_EXPR:
7311     case BIT_XOR_EXPR:
7312     case BIT_NOT_EXPR:
7313       return 2;
7314
7315     default:
7316       return INT_MAX;
7317     }
7318 }
7319
7320 /* Return true if VINFO indicates we are doing loop vectorization and if
7321    it is worth decomposing CODE operations into scalar operations for
7322    that loop's vectorization factor.  */
7323
7324 bool
7325 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7326 {
7327   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7328   unsigned HOST_WIDE_INT value;
7329   return (loop_vinfo
7330           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7331           && value >= vect_min_worthwhile_factor (code));
7332 }
7333
7334 /* Function vectorizable_induction
7335
7336    Check if PHI performs an induction computation that can be vectorized.
7337    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7338    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7339    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7340
7341 bool
7342 vectorizable_induction (gimple *phi,
7343                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7344                         gimple **vec_stmt, slp_tree slp_node,
7345                         stmt_vector_for_cost *cost_vec)
7346 {
7347   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7348   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7349   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7350   unsigned ncopies;
7351   bool nested_in_vect_loop = false;
7352   struct loop *iv_loop;
7353   tree vec_def;
7354   edge pe = loop_preheader_edge (loop);
7355   basic_block new_bb;
7356   tree new_vec, vec_init, vec_step, t;
7357   tree new_name;
7358   gimple *new_stmt;
7359   gphi *induction_phi;
7360   tree induc_def, vec_dest;
7361   tree init_expr, step_expr;
7362   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7363   unsigned i;
7364   tree expr;
7365   gimple_seq stmts;
7366   imm_use_iterator imm_iter;
7367   use_operand_p use_p;
7368   gimple *exit_phi;
7369   edge latch_e;
7370   tree loop_arg;
7371   gimple_stmt_iterator si;
7372   basic_block bb = gimple_bb (phi);
7373
7374   if (gimple_code (phi) != GIMPLE_PHI)
7375     return false;
7376
7377   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7378     return false;
7379
7380   /* Make sure it was recognized as induction computation.  */
7381   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7382     return false;
7383
7384   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7385   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7386
7387   if (slp_node)
7388     ncopies = 1;
7389   else
7390     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7391   gcc_assert (ncopies >= 1);
7392
7393   /* FORNOW. These restrictions should be relaxed.  */
7394   if (nested_in_vect_loop_p (loop, phi))
7395     {
7396       imm_use_iterator imm_iter;
7397       use_operand_p use_p;
7398       gimple *exit_phi;
7399       edge latch_e;
7400       tree loop_arg;
7401
7402       if (ncopies > 1)
7403         {
7404           if (dump_enabled_p ())
7405             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7406                              "multiple types in nested loop.\n");
7407           return false;
7408         }
7409
7410       /* FORNOW: outer loop induction with SLP not supported.  */
7411       if (STMT_SLP_TYPE (stmt_info))
7412         return false;
7413
7414       exit_phi = NULL;
7415       latch_e = loop_latch_edge (loop->inner);
7416       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7417       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7418         {
7419           gimple *use_stmt = USE_STMT (use_p);
7420           if (is_gimple_debug (use_stmt))
7421             continue;
7422
7423           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7424             {
7425               exit_phi = use_stmt;
7426               break;
7427             }
7428         }
7429       if (exit_phi)
7430         {
7431           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7432           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7433                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7434             {
7435               if (dump_enabled_p ())
7436                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7437                                  "inner-loop induction only used outside "
7438                                  "of the outer vectorized loop.\n");
7439               return false;
7440             }
7441         }
7442
7443       nested_in_vect_loop = true;
7444       iv_loop = loop->inner;
7445     }
7446   else
7447     iv_loop = loop;
7448   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7449
7450   if (slp_node && !nunits.is_constant ())
7451     {
7452       /* The current SLP code creates the initial value element-by-element.  */
7453       if (dump_enabled_p ())
7454         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7455                          "SLP induction not supported for variable-length"
7456                          " vectors.\n");
7457       return false;
7458     }
7459
7460   if (!vec_stmt) /* transformation not required.  */
7461     {
7462       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7463       if (dump_enabled_p ())
7464         dump_printf_loc (MSG_NOTE, vect_location,
7465                          "=== vectorizable_induction ===\n");
7466       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7467       return true;
7468     }
7469
7470   /* Transform.  */
7471
7472   /* Compute a vector variable, initialized with the first VF values of
7473      the induction variable.  E.g., for an iv with IV_PHI='X' and
7474      evolution S, for a vector of 4 units, we want to compute:
7475      [X, X + S, X + 2*S, X + 3*S].  */
7476
7477   if (dump_enabled_p ())
7478     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7479
7480   latch_e = loop_latch_edge (iv_loop);
7481   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7482
7483   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7484   gcc_assert (step_expr != NULL_TREE);
7485
7486   pe = loop_preheader_edge (iv_loop);
7487   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7488                                      loop_preheader_edge (iv_loop));
7489
7490   stmts = NULL;
7491   if (!nested_in_vect_loop)
7492     {
7493       /* Convert the initial value to the desired type.  */
7494       tree new_type = TREE_TYPE (vectype);
7495       init_expr = gimple_convert (&stmts, new_type, init_expr);
7496
7497       /* If we are using the loop mask to "peel" for alignment then we need
7498          to adjust the start value here.  */
7499       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7500       if (skip_niters != NULL_TREE)
7501         {
7502           if (FLOAT_TYPE_P (vectype))
7503             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7504                                         skip_niters);
7505           else
7506             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7507           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7508                                          skip_niters, step_expr);
7509           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7510                                     init_expr, skip_step);
7511         }
7512     }
7513
7514   /* Convert the step to the desired type.  */
7515   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7516
7517   if (stmts)
7518     {
7519       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7520       gcc_assert (!new_bb);
7521     }
7522
7523   /* Find the first insertion point in the BB.  */
7524   si = gsi_after_labels (bb);
7525
7526   /* For SLP induction we have to generate several IVs as for example
7527      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7528      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7529      [VF*S, VF*S, VF*S, VF*S] for all.  */
7530   if (slp_node)
7531     {
7532       /* Enforced above.  */
7533       unsigned int const_nunits = nunits.to_constant ();
7534
7535       /* Generate [VF*S, VF*S, ... ].  */
7536       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7537         {
7538           expr = build_int_cst (integer_type_node, vf);
7539           expr = fold_convert (TREE_TYPE (step_expr), expr);
7540         }
7541       else
7542         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7543       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7544                               expr, step_expr);
7545       if (! CONSTANT_CLASS_P (new_name))
7546         new_name = vect_init_vector (phi, new_name,
7547                                      TREE_TYPE (step_expr), NULL);
7548       new_vec = build_vector_from_val (vectype, new_name);
7549       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7550
7551       /* Now generate the IVs.  */
7552       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7553       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7554       unsigned elts = const_nunits * nvects;
7555       unsigned nivs = least_common_multiple (group_size,
7556                                              const_nunits) / const_nunits;
7557       gcc_assert (elts % group_size == 0);
7558       tree elt = init_expr;
7559       unsigned ivn;
7560       for (ivn = 0; ivn < nivs; ++ivn)
7561         {
7562           tree_vector_builder elts (vectype, const_nunits, 1);
7563           stmts = NULL;
7564           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7565             {
7566               if (ivn*const_nunits + eltn >= group_size
7567                   && (ivn * const_nunits + eltn) % group_size == 0)
7568                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7569                                     elt, step_expr);
7570               elts.quick_push (elt);
7571             }
7572           vec_init = gimple_build_vector (&stmts, &elts);
7573           if (stmts)
7574             {
7575               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7576               gcc_assert (!new_bb);
7577             }
7578
7579           /* Create the induction-phi that defines the induction-operand.  */
7580           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7581           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7582           set_vinfo_for_stmt (induction_phi,
7583                               new_stmt_vec_info (induction_phi, loop_vinfo));
7584           induc_def = PHI_RESULT (induction_phi);
7585
7586           /* Create the iv update inside the loop  */
7587           vec_def = make_ssa_name (vec_dest);
7588           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7589           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7590           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7591
7592           /* Set the arguments of the phi node:  */
7593           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7594           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7595                        UNKNOWN_LOCATION);
7596
7597           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7598         }
7599
7600       /* Re-use IVs when we can.  */
7601       if (ivn < nvects)
7602         {
7603           unsigned vfp
7604             = least_common_multiple (group_size, const_nunits) / group_size;
7605           /* Generate [VF'*S, VF'*S, ... ].  */
7606           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7607             {
7608               expr = build_int_cst (integer_type_node, vfp);
7609               expr = fold_convert (TREE_TYPE (step_expr), expr);
7610             }
7611           else
7612             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7613           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7614                                   expr, step_expr);
7615           if (! CONSTANT_CLASS_P (new_name))
7616             new_name = vect_init_vector (phi, new_name,
7617                                          TREE_TYPE (step_expr), NULL);
7618           new_vec = build_vector_from_val (vectype, new_name);
7619           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7620           for (; ivn < nvects; ++ivn)
7621             {
7622               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7623               tree def;
7624               if (gimple_code (iv) == GIMPLE_PHI)
7625                 def = gimple_phi_result (iv);
7626               else
7627                 def = gimple_assign_lhs (iv);
7628               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7629                                               PLUS_EXPR,
7630                                               def, vec_step);
7631               if (gimple_code (iv) == GIMPLE_PHI)
7632                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7633               else
7634                 {
7635                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7636                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7637                 }
7638               set_vinfo_for_stmt (new_stmt,
7639                                   new_stmt_vec_info (new_stmt, loop_vinfo));
7640               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7641             }
7642         }
7643
7644       return true;
7645     }
7646
7647   /* Create the vector that holds the initial_value of the induction.  */
7648   if (nested_in_vect_loop)
7649     {
7650       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7651          been created during vectorization of previous stmts.  We obtain it
7652          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7653       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7654       /* If the initial value is not of proper type, convert it.  */
7655       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7656         {
7657           new_stmt
7658             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7659                                                           vect_simple_var,
7660                                                           "vec_iv_"),
7661                                    VIEW_CONVERT_EXPR,
7662                                    build1 (VIEW_CONVERT_EXPR, vectype,
7663                                            vec_init));
7664           vec_init = gimple_assign_lhs (new_stmt);
7665           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7666                                                  new_stmt);
7667           gcc_assert (!new_bb);
7668           set_vinfo_for_stmt (new_stmt,
7669                               new_stmt_vec_info (new_stmt, loop_vinfo));
7670         }
7671     }
7672   else
7673     {
7674       /* iv_loop is the loop to be vectorized. Create:
7675          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7676       stmts = NULL;
7677       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7678
7679       unsigned HOST_WIDE_INT const_nunits;
7680       if (nunits.is_constant (&const_nunits))
7681         {
7682           tree_vector_builder elts (vectype, const_nunits, 1);
7683           elts.quick_push (new_name);
7684           for (i = 1; i < const_nunits; i++)
7685             {
7686               /* Create: new_name_i = new_name + step_expr  */
7687               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7688                                        new_name, step_expr);
7689               elts.quick_push (new_name);
7690             }
7691           /* Create a vector from [new_name_0, new_name_1, ...,
7692              new_name_nunits-1]  */
7693           vec_init = gimple_build_vector (&stmts, &elts);
7694         }
7695       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7696         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7697         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7698                                  new_name, step_expr);
7699       else
7700         {
7701           /* Build:
7702                 [base, base, base, ...]
7703                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7704           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7705           gcc_assert (flag_associative_math);
7706           tree index = build_index_vector (vectype, 0, 1);
7707           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7708                                                         new_name);
7709           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7710                                                         step_expr);
7711           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7712           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7713                                    vec_init, step_vec);
7714           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7715                                    vec_init, base_vec);
7716         }
7717
7718       if (stmts)
7719         {
7720           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7721           gcc_assert (!new_bb);
7722         }
7723     }
7724
7725
7726   /* Create the vector that holds the step of the induction.  */
7727   if (nested_in_vect_loop)
7728     /* iv_loop is nested in the loop to be vectorized. Generate:
7729        vec_step = [S, S, S, S]  */
7730     new_name = step_expr;
7731   else
7732     {
7733       /* iv_loop is the loop to be vectorized. Generate:
7734           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7735       gimple_seq seq = NULL;
7736       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7737         {
7738           expr = build_int_cst (integer_type_node, vf);
7739           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7740         }
7741       else
7742         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7743       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7744                                expr, step_expr);
7745       if (seq)
7746         {
7747           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7748           gcc_assert (!new_bb);
7749         }
7750     }
7751
7752   t = unshare_expr (new_name);
7753   gcc_assert (CONSTANT_CLASS_P (new_name)
7754               || TREE_CODE (new_name) == SSA_NAME);
7755   new_vec = build_vector_from_val (vectype, t);
7756   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7757
7758
7759   /* Create the following def-use cycle:
7760      loop prolog:
7761          vec_init = ...
7762          vec_step = ...
7763      loop:
7764          vec_iv = PHI <vec_init, vec_loop>
7765          ...
7766          STMT
7767          ...
7768          vec_loop = vec_iv + vec_step;  */
7769
7770   /* Create the induction-phi that defines the induction-operand.  */
7771   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7772   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7773   set_vinfo_for_stmt (induction_phi,
7774                       new_stmt_vec_info (induction_phi, loop_vinfo));
7775   induc_def = PHI_RESULT (induction_phi);
7776
7777   /* Create the iv update inside the loop  */
7778   vec_def = make_ssa_name (vec_dest);
7779   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7780   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7781   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7782
7783   /* Set the arguments of the phi node:  */
7784   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7785   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7786                UNKNOWN_LOCATION);
7787
7788   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7789
7790   /* In case that vectorization factor (VF) is bigger than the number
7791      of elements that we can fit in a vectype (nunits), we have to generate
7792      more than one vector stmt - i.e - we need to "unroll" the
7793      vector stmt by a factor VF/nunits.  For more details see documentation
7794      in vectorizable_operation.  */
7795
7796   if (ncopies > 1)
7797     {
7798       gimple_seq seq = NULL;
7799       stmt_vec_info prev_stmt_vinfo;
7800       /* FORNOW. This restriction should be relaxed.  */
7801       gcc_assert (!nested_in_vect_loop);
7802
7803       /* Create the vector that holds the step of the induction.  */
7804       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7805         {
7806           expr = build_int_cst (integer_type_node, nunits);
7807           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7808         }
7809       else
7810         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7811       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7812                                expr, step_expr);
7813       if (seq)
7814         {
7815           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7816           gcc_assert (!new_bb);
7817         }
7818
7819       t = unshare_expr (new_name);
7820       gcc_assert (CONSTANT_CLASS_P (new_name)
7821                   || TREE_CODE (new_name) == SSA_NAME);
7822       new_vec = build_vector_from_val (vectype, t);
7823       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7824
7825       vec_def = induc_def;
7826       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7827       for (i = 1; i < ncopies; i++)
7828         {
7829           /* vec_i = vec_prev + vec_step  */
7830           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7831                                           vec_def, vec_step);
7832           vec_def = make_ssa_name (vec_dest, new_stmt);
7833           gimple_assign_set_lhs (new_stmt, vec_def);
7834
7835           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7836           set_vinfo_for_stmt (new_stmt,
7837                               new_stmt_vec_info (new_stmt, loop_vinfo));
7838           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7839           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7840         }
7841     }
7842
7843   if (nested_in_vect_loop)
7844     {
7845       /* Find the loop-closed exit-phi of the induction, and record
7846          the final vector of induction results:  */
7847       exit_phi = NULL;
7848       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7849         {
7850           gimple *use_stmt = USE_STMT (use_p);
7851           if (is_gimple_debug (use_stmt))
7852             continue;
7853
7854           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7855             {
7856               exit_phi = use_stmt;
7857               break;
7858             }
7859         }
7860       if (exit_phi)
7861         {
7862           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7863           /* FORNOW. Currently not supporting the case that an inner-loop induction
7864              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7865           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7866                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7867
7868           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7869           if (dump_enabled_p ())
7870             {
7871               dump_printf_loc (MSG_NOTE, vect_location,
7872                                "vector of inductions after inner-loop:");
7873               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7874             }
7875         }
7876     }
7877
7878
7879   if (dump_enabled_p ())
7880     {
7881       dump_printf_loc (MSG_NOTE, vect_location,
7882                        "transform induction: created def-use cycle: ");
7883       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7884       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7885                         SSA_NAME_DEF_STMT (vec_def), 0);
7886     }
7887
7888   return true;
7889 }
7890
7891 /* Function vectorizable_live_operation.
7892
7893    STMT computes a value that is used outside the loop.  Check if
7894    it can be supported.  */
7895
7896 bool
7897 vectorizable_live_operation (gimple *stmt,
7898                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7899                              slp_tree slp_node, int slp_index,
7900                              gimple **vec_stmt,
7901                              stmt_vector_for_cost *)
7902 {
7903   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7904   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7905   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7906   imm_use_iterator imm_iter;
7907   tree lhs, lhs_type, bitsize, vec_bitsize;
7908   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7909   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7910   int ncopies;
7911   gimple *use_stmt;
7912   auto_vec<tree> vec_oprnds;
7913   int vec_entry = 0;
7914   poly_uint64 vec_index = 0;
7915
7916   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7917
7918   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7919     return false;
7920
7921   /* FORNOW.  CHECKME.  */
7922   if (nested_in_vect_loop_p (loop, stmt))
7923     return false;
7924
7925   /* If STMT is not relevant and it is a simple assignment and its inputs are
7926      invariant then it can remain in place, unvectorized.  The original last
7927      scalar value that it computes will be used.  */
7928   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7929     {
7930       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7931       if (dump_enabled_p ())
7932         dump_printf_loc (MSG_NOTE, vect_location,
7933                          "statement is simple and uses invariant.  Leaving in "
7934                          "place.\n");
7935       return true;
7936     }
7937
7938   if (slp_node)
7939     ncopies = 1;
7940   else
7941     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7942
7943   if (slp_node)
7944     {
7945       gcc_assert (slp_index >= 0);
7946
7947       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7948       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7949
7950       /* Get the last occurrence of the scalar index from the concatenation of
7951          all the slp vectors. Calculate which slp vector it is and the index
7952          within.  */
7953       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7954
7955       /* Calculate which vector contains the result, and which lane of
7956          that vector we need.  */
7957       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7958         {
7959           if (dump_enabled_p ())
7960             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7961                              "Cannot determine which vector holds the"
7962                              " final result.\n");
7963           return false;
7964         }
7965     }
7966
7967   if (!vec_stmt)
7968     {
7969       /* No transformation required.  */
7970       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7971         {
7972           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7973                                                OPTIMIZE_FOR_SPEED))
7974             {
7975               if (dump_enabled_p ())
7976                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977                                  "can't use a fully-masked loop because "
7978                                  "the target doesn't support extract last "
7979                                  "reduction.\n");
7980               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7981             }
7982           else if (slp_node)
7983             {
7984               if (dump_enabled_p ())
7985                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7986                                  "can't use a fully-masked loop because an "
7987                                  "SLP statement is live after the loop.\n");
7988               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7989             }
7990           else if (ncopies > 1)
7991             {
7992               if (dump_enabled_p ())
7993                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7994                                  "can't use a fully-masked loop because"
7995                                  " ncopies is greater than 1.\n");
7996               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7997             }
7998           else
7999             {
8000               gcc_assert (ncopies == 1 && !slp_node);
8001               vect_record_loop_mask (loop_vinfo,
8002                                      &LOOP_VINFO_MASKS (loop_vinfo),
8003                                      1, vectype);
8004             }
8005         }
8006       return true;
8007     }
8008
8009   /* If stmt has a related stmt, then use that for getting the lhs.  */
8010   if (is_pattern_stmt_p (stmt_info))
8011     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8012
8013   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8014         : gimple_get_lhs (stmt);
8015   lhs_type = TREE_TYPE (lhs);
8016
8017   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8018              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8019              : TYPE_SIZE (TREE_TYPE (vectype)));
8020   vec_bitsize = TYPE_SIZE (vectype);
8021
8022   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8023   tree vec_lhs, bitstart;
8024   if (slp_node)
8025     {
8026       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8027
8028       /* Get the correct slp vectorized stmt.  */
8029       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8030       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8031         vec_lhs = gimple_phi_result (phi);
8032       else
8033         vec_lhs = gimple_get_lhs (vec_stmt);
8034
8035       /* Get entry to use.  */
8036       bitstart = bitsize_int (vec_index);
8037       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8038     }
8039   else
8040     {
8041       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8042       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8043       gcc_checking_assert (ncopies == 1
8044                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8045
8046       /* For multiple copies, get the last copy.  */
8047       for (int i = 1; i < ncopies; ++i)
8048         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8049                                                   vec_lhs);
8050
8051       /* Get the last lane in the vector.  */
8052       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8053     }
8054
8055   gimple_seq stmts = NULL;
8056   tree new_tree;
8057   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8058     {
8059       /* Emit:
8060
8061            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8062
8063          where VEC_LHS is the vectorized live-out result and MASK is
8064          the loop mask for the final iteration.  */
8065       gcc_assert (ncopies == 1 && !slp_node);
8066       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8067       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8068                                       1, vectype, 0);
8069       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8070                                       scalar_type, mask, vec_lhs);
8071
8072       /* Convert the extracted vector element to the required scalar type.  */
8073       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8074     }
8075   else
8076     {
8077       tree bftype = TREE_TYPE (vectype);
8078       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8079         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8080       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8081       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8082                                        &stmts, true, NULL_TREE);
8083     }
8084
8085   if (stmts)
8086     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8087
8088   /* Replace use of lhs with newly computed result.  If the use stmt is a
8089      single arg PHI, just replace all uses of PHI result.  It's necessary
8090      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8091   use_operand_p use_p;
8092   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8093     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8094         && !is_gimple_debug (use_stmt))
8095     {
8096       if (gimple_code (use_stmt) == GIMPLE_PHI
8097           && gimple_phi_num_args (use_stmt) == 1)
8098         {
8099           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8100         }
8101       else
8102         {
8103           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8104             SET_USE (use_p, new_tree);
8105         }
8106       update_stmt (use_stmt);
8107     }
8108
8109   return true;
8110 }
8111
8112 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8113
8114 static void
8115 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8116 {
8117   ssa_op_iter op_iter;
8118   imm_use_iterator imm_iter;
8119   def_operand_p def_p;
8120   gimple *ustmt;
8121
8122   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8123     {
8124       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8125         {
8126           basic_block bb;
8127
8128           if (!is_gimple_debug (ustmt))
8129             continue;
8130
8131           bb = gimple_bb (ustmt);
8132
8133           if (!flow_bb_inside_loop_p (loop, bb))
8134             {
8135               if (gimple_debug_bind_p (ustmt))
8136                 {
8137                   if (dump_enabled_p ())
8138                     dump_printf_loc (MSG_NOTE, vect_location,
8139                                      "killing debug use\n");
8140
8141                   gimple_debug_bind_reset_value (ustmt);
8142                   update_stmt (ustmt);
8143                 }
8144               else
8145                 gcc_unreachable ();
8146             }
8147         }
8148     }
8149 }
8150
8151 /* Given loop represented by LOOP_VINFO, return true if computation of
8152    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8153    otherwise.  */
8154
8155 static bool
8156 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8157 {
8158   /* Constant case.  */
8159   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8160     {
8161       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8162       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8163
8164       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8165       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8166       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8167         return true;
8168     }
8169
8170   widest_int max;
8171   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8172   /* Check the upper bound of loop niters.  */
8173   if (get_max_loop_iterations (loop, &max))
8174     {
8175       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8176       signop sgn = TYPE_SIGN (type);
8177       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8178       if (max < type_max)
8179         return true;
8180     }
8181   return false;
8182 }
8183
8184 /* Return a mask type with half the number of elements as TYPE.  */
8185
8186 tree
8187 vect_halve_mask_nunits (tree type)
8188 {
8189   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8190   return build_truth_vector_type (nunits, current_vector_size);
8191 }
8192
8193 /* Return a mask type with twice as many elements as TYPE.  */
8194
8195 tree
8196 vect_double_mask_nunits (tree type)
8197 {
8198   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8199   return build_truth_vector_type (nunits, current_vector_size);
8200 }
8201
8202 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8203    contain a sequence of NVECTORS masks that each control a vector of type
8204    VECTYPE.  */
8205
8206 void
8207 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8208                        unsigned int nvectors, tree vectype)
8209 {
8210   gcc_assert (nvectors != 0);
8211   if (masks->length () < nvectors)
8212     masks->safe_grow_cleared (nvectors);
8213   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8214   /* The number of scalars per iteration and the number of vectors are
8215      both compile-time constants.  */
8216   unsigned int nscalars_per_iter
8217     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8218                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8219   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8220     {
8221       rgm->max_nscalars_per_iter = nscalars_per_iter;
8222       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8223     }
8224 }
8225
8226 /* Given a complete set of masks MASKS, extract mask number INDEX
8227    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8228    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8229
8230    See the comment above vec_loop_masks for more details about the mask
8231    arrangement.  */
8232
8233 tree
8234 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8235                     unsigned int nvectors, tree vectype, unsigned int index)
8236 {
8237   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8238   tree mask_type = rgm->mask_type;
8239
8240   /* Populate the rgroup's mask array, if this is the first time we've
8241      used it.  */
8242   if (rgm->masks.is_empty ())
8243     {
8244       rgm->masks.safe_grow_cleared (nvectors);
8245       for (unsigned int i = 0; i < nvectors; ++i)
8246         {
8247           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8248           /* Provide a dummy definition until the real one is available.  */
8249           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8250           rgm->masks[i] = mask;
8251         }
8252     }
8253
8254   tree mask = rgm->masks[index];
8255   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8256                 TYPE_VECTOR_SUBPARTS (vectype)))
8257     {
8258       /* A loop mask for data type X can be reused for data type Y
8259          if X has N times more elements than Y and if Y's elements
8260          are N times bigger than X's.  In this case each sequence
8261          of N elements in the loop mask will be all-zero or all-one.
8262          We can then view-convert the mask so that each sequence of
8263          N elements is replaced by a single element.  */
8264       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8265                               TYPE_VECTOR_SUBPARTS (vectype)));
8266       gimple_seq seq = NULL;
8267       mask_type = build_same_sized_truth_vector_type (vectype);
8268       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8269       if (seq)
8270         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8271     }
8272   return mask;
8273 }
8274
8275 /* Scale profiling counters by estimation for LOOP which is vectorized
8276    by factor VF.  */
8277
8278 static void
8279 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8280 {
8281   edge preheader = loop_preheader_edge (loop);
8282   /* Reduce loop iterations by the vectorization factor.  */
8283   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8284   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8285
8286   if (freq_h.nonzero_p ())
8287     {
8288       profile_probability p;
8289
8290       /* Avoid dropping loop body profile counter to 0 because of zero count
8291          in loop's preheader.  */
8292       if (!(freq_e == profile_count::zero ()))
8293         freq_e = freq_e.force_nonzero ();
8294       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8295       scale_loop_frequencies (loop, p);
8296     }
8297
8298   edge exit_e = single_exit (loop);
8299   exit_e->probability = profile_probability::always ()
8300                                  .apply_scale (1, new_est_niter + 1);
8301
8302   edge exit_l = single_pred_edge (loop->latch);
8303   profile_probability prob = exit_l->probability;
8304   exit_l->probability = exit_e->probability.invert ();
8305   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8306     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8307 }
8308
8309 /* Function vect_transform_loop.
8310
8311    The analysis phase has determined that the loop is vectorizable.
8312    Vectorize the loop - created vectorized stmts to replace the scalar
8313    stmts in the loop, and update the loop exit condition.
8314    Returns scalar epilogue loop if any.  */
8315
8316 struct loop *
8317 vect_transform_loop (loop_vec_info loop_vinfo)
8318 {
8319   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8320   struct loop *epilogue = NULL;
8321   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8322   int nbbs = loop->num_nodes;
8323   int i;
8324   tree niters_vector = NULL_TREE;
8325   tree step_vector = NULL_TREE;
8326   tree niters_vector_mult_vf = NULL_TREE;
8327   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8328   unsigned int lowest_vf = constant_lower_bound (vf);
8329   bool grouped_store;
8330   bool slp_scheduled = false;
8331   gimple *stmt, *pattern_stmt;
8332   gimple_seq pattern_def_seq = NULL;
8333   gimple_stmt_iterator pattern_def_si = gsi_none ();
8334   bool transform_pattern_stmt = false;
8335   bool check_profitability = false;
8336   unsigned int th;
8337
8338   if (dump_enabled_p ())
8339     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8340
8341   /* Use the more conservative vectorization threshold.  If the number
8342      of iterations is constant assume the cost check has been performed
8343      by our caller.  If the threshold makes all loops profitable that
8344      run at least the (estimated) vectorization factor number of times
8345      checking is pointless, too.  */
8346   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8347   if (th >= vect_vf_for_cost (loop_vinfo)
8348       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8349     {
8350       if (dump_enabled_p ())
8351         dump_printf_loc (MSG_NOTE, vect_location,
8352                          "Profitability threshold is %d loop iterations.\n",
8353                          th);
8354       check_profitability = true;
8355     }
8356
8357   /* Make sure there exists a single-predecessor exit bb.  Do this before
8358      versioning.   */
8359   edge e = single_exit (loop);
8360   if (! single_pred_p (e->dest))
8361     {
8362       split_loop_exit_edge (e);
8363       if (dump_enabled_p ())
8364         dump_printf (MSG_NOTE, "split exit edge\n");
8365     }
8366
8367   /* Version the loop first, if required, so the profitability check
8368      comes first.  */
8369
8370   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8371     {
8372       poly_uint64 versioning_threshold
8373         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8374       if (check_profitability
8375           && ordered_p (poly_uint64 (th), versioning_threshold))
8376         {
8377           versioning_threshold = ordered_max (poly_uint64 (th),
8378                                               versioning_threshold);
8379           check_profitability = false;
8380         }
8381       vect_loop_versioning (loop_vinfo, th, check_profitability,
8382                             versioning_threshold);
8383       check_profitability = false;
8384     }
8385
8386   /* Make sure there exists a single-predecessor exit bb also on the
8387      scalar loop copy.  Do this after versioning but before peeling
8388      so CFG structure is fine for both scalar and if-converted loop
8389      to make slpeel_duplicate_current_defs_from_edges face matched
8390      loop closed PHI nodes on the exit.  */
8391   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8392     {
8393       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8394       if (! single_pred_p (e->dest))
8395         {
8396           split_loop_exit_edge (e);
8397           if (dump_enabled_p ())
8398             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8399         }
8400     }
8401
8402   tree niters = vect_build_loop_niters (loop_vinfo);
8403   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8404   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8405   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8406   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8407                               &step_vector, &niters_vector_mult_vf, th,
8408                               check_profitability, niters_no_overflow);
8409
8410   if (niters_vector == NULL_TREE)
8411     {
8412       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8413           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8414           && known_eq (lowest_vf, vf))
8415         {
8416           niters_vector
8417             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8418                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8419           step_vector = build_one_cst (TREE_TYPE (niters));
8420         }
8421       else
8422         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8423                                      &step_vector, niters_no_overflow);
8424     }
8425
8426   /* 1) Make sure the loop header has exactly two entries
8427      2) Make sure we have a preheader basic block.  */
8428
8429   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8430
8431   split_edge (loop_preheader_edge (loop));
8432
8433   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8434       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8435     /* This will deal with any possible peeling.  */
8436     vect_prepare_for_masked_peels (loop_vinfo);
8437
8438   /* FORNOW: the vectorizer supports only loops which body consist
8439      of one basic block (header + empty latch). When the vectorizer will
8440      support more involved loop forms, the order by which the BBs are
8441      traversed need to be reconsidered.  */
8442
8443   for (i = 0; i < nbbs; i++)
8444     {
8445       basic_block bb = bbs[i];
8446       stmt_vec_info stmt_info;
8447
8448       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8449            gsi_next (&si))
8450         {
8451           gphi *phi = si.phi ();
8452           if (dump_enabled_p ())
8453             {
8454               dump_printf_loc (MSG_NOTE, vect_location,
8455                                "------>vectorizing phi: ");
8456               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8457             }
8458           stmt_info = vinfo_for_stmt (phi);
8459           if (!stmt_info)
8460             continue;
8461
8462           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8463             vect_loop_kill_debug_uses (loop, phi);
8464
8465           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8466               && !STMT_VINFO_LIVE_P (stmt_info))
8467             continue;
8468
8469           if (STMT_VINFO_VECTYPE (stmt_info)
8470               && (maybe_ne
8471                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8472               && dump_enabled_p ())
8473             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8474
8475           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8476                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8477                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8478               && ! PURE_SLP_STMT (stmt_info))
8479             {
8480               if (dump_enabled_p ())
8481                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8482               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8483             }
8484         }
8485
8486       pattern_stmt = NULL;
8487       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8488            !gsi_end_p (si) || transform_pattern_stmt;)
8489         {
8490           bool is_store;
8491
8492           if (transform_pattern_stmt)
8493             stmt = pattern_stmt;
8494           else
8495             {
8496               stmt = gsi_stmt (si);
8497               /* During vectorization remove existing clobber stmts.  */
8498               if (gimple_clobber_p (stmt))
8499                 {
8500                   unlink_stmt_vdef (stmt);
8501                   gsi_remove (&si, true);
8502                   release_defs (stmt);
8503                   continue;
8504                 }
8505             }
8506
8507           if (dump_enabled_p ())
8508             {
8509               dump_printf_loc (MSG_NOTE, vect_location,
8510                                "------>vectorizing statement: ");
8511               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8512             }
8513
8514           stmt_info = vinfo_for_stmt (stmt);
8515
8516           /* vector stmts created in the outer-loop during vectorization of
8517              stmts in an inner-loop may not have a stmt_info, and do not
8518              need to be vectorized.  */
8519           if (!stmt_info)
8520             {
8521               gsi_next (&si);
8522               continue;
8523             }
8524
8525           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8526             vect_loop_kill_debug_uses (loop, stmt);
8527
8528           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8529               && !STMT_VINFO_LIVE_P (stmt_info))
8530             {
8531               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8532                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8533                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8534                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8535                 {
8536                   stmt = pattern_stmt;
8537                   stmt_info = vinfo_for_stmt (stmt);
8538                 }
8539               else
8540                 {
8541                   gsi_next (&si);
8542                   continue;
8543                 }
8544             }
8545           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8546                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8547                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8548                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8549             transform_pattern_stmt = true;
8550
8551           /* If pattern statement has def stmts, vectorize them too.  */
8552           if (is_pattern_stmt_p (stmt_info))
8553             {
8554               if (pattern_def_seq == NULL)
8555                 {
8556                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8557                   pattern_def_si = gsi_start (pattern_def_seq);
8558                 }
8559               else if (!gsi_end_p (pattern_def_si))
8560                 gsi_next (&pattern_def_si);
8561               if (pattern_def_seq != NULL)
8562                 {
8563                   gimple *pattern_def_stmt = NULL;
8564                   stmt_vec_info pattern_def_stmt_info = NULL;
8565
8566                   while (!gsi_end_p (pattern_def_si))
8567                     {
8568                       pattern_def_stmt = gsi_stmt (pattern_def_si);
8569                       pattern_def_stmt_info
8570                         = vinfo_for_stmt (pattern_def_stmt);
8571                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8572                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8573                         break;
8574                       gsi_next (&pattern_def_si);
8575                     }
8576
8577                   if (!gsi_end_p (pattern_def_si))
8578                     {
8579                       if (dump_enabled_p ())
8580                         {
8581                           dump_printf_loc (MSG_NOTE, vect_location,
8582                                            "==> vectorizing pattern def "
8583                                            "stmt: ");
8584                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8585                                             pattern_def_stmt, 0);
8586                         }
8587
8588                       stmt = pattern_def_stmt;
8589                       stmt_info = pattern_def_stmt_info;
8590                     }
8591                   else
8592                     {
8593                       pattern_def_si = gsi_none ();
8594                       transform_pattern_stmt = false;
8595                     }
8596                 }
8597               else
8598                 transform_pattern_stmt = false;
8599             }
8600
8601           if (STMT_VINFO_VECTYPE (stmt_info))
8602             {
8603               poly_uint64 nunits
8604                 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8605               if (!STMT_SLP_TYPE (stmt_info)
8606                   && maybe_ne (nunits, vf)
8607                   && dump_enabled_p ())
8608                   /* For SLP VF is set according to unrolling factor, and not
8609                      to vector size, hence for SLP this print is not valid.  */
8610                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8611             }
8612
8613           /* SLP. Schedule all the SLP instances when the first SLP stmt is
8614              reached.  */
8615           if (STMT_SLP_TYPE (stmt_info))
8616             {
8617               if (!slp_scheduled)
8618                 {
8619                   slp_scheduled = true;
8620
8621                   if (dump_enabled_p ())
8622                     dump_printf_loc (MSG_NOTE, vect_location,
8623                                      "=== scheduling SLP instances ===\n");
8624
8625                   vect_schedule_slp (loop_vinfo);
8626                 }
8627
8628               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8629               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8630                 {
8631                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8632                     {
8633                       pattern_def_seq = NULL;
8634                       gsi_next (&si);
8635                     }
8636                   continue;
8637                 }
8638             }
8639
8640           /* -------- vectorize statement ------------ */
8641           if (dump_enabled_p ())
8642             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8643
8644           grouped_store = false;
8645           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8646           if (is_store)
8647             {
8648               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8649                 {
8650                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8651                      interleaving chain was completed - free all the stores in
8652                      the chain.  */
8653                   gsi_next (&si);
8654                   vect_remove_stores (DR_GROUP_FIRST_ELEMENT (stmt_info));
8655                 }
8656               else
8657                 {
8658                   /* Free the attached stmt_vec_info and remove the stmt.  */
8659                   gimple *store = gsi_stmt (si);
8660                   free_stmt_vec_info (store);
8661                   unlink_stmt_vdef (store);
8662                   gsi_remove (&si, true);
8663                   release_defs (store);
8664                 }
8665
8666               /* Stores can only appear at the end of pattern statements.  */
8667               gcc_assert (!transform_pattern_stmt);
8668               pattern_def_seq = NULL;
8669             }
8670           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8671             {
8672               pattern_def_seq = NULL;
8673               gsi_next (&si);
8674             }
8675         }                       /* stmts in BB */
8676
8677       /* Stub out scalar statements that must not survive vectorization.
8678          Doing this here helps with grouped statements, or statements that
8679          are involved in patterns.  */
8680       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8681            !gsi_end_p (gsi); gsi_next (&gsi))
8682         {
8683           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8684           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8685             {
8686               tree lhs = gimple_get_lhs (call);
8687               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8688                 {
8689                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8690                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8691                   gsi_replace (&gsi, new_stmt, true);
8692                 }
8693             }
8694         }
8695     }                           /* BBs in loop */
8696
8697   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8698      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8699   if (integer_onep (step_vector))
8700     niters_no_overflow = true;
8701   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8702                            niters_vector_mult_vf, !niters_no_overflow);
8703
8704   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8705   scale_profile_for_vect_loop (loop, assumed_vf);
8706
8707   /* True if the final iteration might not handle a full vector's
8708      worth of scalar iterations.  */
8709   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8710   /* The minimum number of iterations performed by the epilogue.  This
8711      is 1 when peeling for gaps because we always need a final scalar
8712      iteration.  */
8713   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8714   /* +1 to convert latch counts to loop iteration counts,
8715      -min_epilogue_iters to remove iterations that cannot be performed
8716        by the vector code.  */
8717   int bias_for_lowest = 1 - min_epilogue_iters;
8718   int bias_for_assumed = bias_for_lowest;
8719   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8720   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8721     {
8722       /* When the amount of peeling is known at compile time, the first
8723          iteration will have exactly alignment_npeels active elements.
8724          In the worst case it will have at least one.  */
8725       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8726       bias_for_lowest += lowest_vf - min_first_active;
8727       bias_for_assumed += assumed_vf - min_first_active;
8728     }
8729   /* In these calculations the "- 1" converts loop iteration counts
8730      back to latch counts.  */
8731   if (loop->any_upper_bound)
8732     loop->nb_iterations_upper_bound
8733       = (final_iter_may_be_partial
8734          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8735                           lowest_vf) - 1
8736          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8737                            lowest_vf) - 1);
8738   if (loop->any_likely_upper_bound)
8739     loop->nb_iterations_likely_upper_bound
8740       = (final_iter_may_be_partial
8741          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8742                           + bias_for_lowest, lowest_vf) - 1
8743          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8744                            + bias_for_lowest, lowest_vf) - 1);
8745   if (loop->any_estimate)
8746     loop->nb_iterations_estimate
8747       = (final_iter_may_be_partial
8748          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8749                           assumed_vf) - 1
8750          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8751                            assumed_vf) - 1);
8752
8753   if (dump_enabled_p ())
8754     {
8755       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8756         {
8757           dump_printf_loc (MSG_NOTE, vect_location,
8758                            "LOOP VECTORIZED\n");
8759           if (loop->inner)
8760             dump_printf_loc (MSG_NOTE, vect_location,
8761                              "OUTER LOOP VECTORIZED\n");
8762           dump_printf (MSG_NOTE, "\n");
8763         }
8764       else
8765         {
8766           dump_printf_loc (MSG_NOTE, vect_location,
8767                            "LOOP EPILOGUE VECTORIZED (VS=");
8768           dump_dec (MSG_NOTE, current_vector_size);
8769           dump_printf (MSG_NOTE, ")\n");
8770         }
8771     }
8772
8773   /* Free SLP instances here because otherwise stmt reference counting
8774      won't work.  */
8775   slp_instance instance;
8776   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8777     vect_free_slp_instance (instance);
8778   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8779   /* Clear-up safelen field since its value is invalid after vectorization
8780      since vectorized loop can have loop-carried dependencies.  */
8781   loop->safelen = 0;
8782
8783   /* Don't vectorize epilogue for epilogue.  */
8784   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8785     epilogue = NULL;
8786
8787   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8788     epilogue = NULL;
8789
8790   if (epilogue)
8791     {
8792       auto_vector_sizes vector_sizes;
8793       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8794       unsigned int next_size = 0;
8795
8796       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8797           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8798           && known_eq (vf, lowest_vf))
8799         {
8800           unsigned int eiters
8801             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8802                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8803           eiters = eiters % lowest_vf;
8804           epilogue->nb_iterations_upper_bound = eiters - 1;
8805
8806           unsigned int ratio;
8807           while (next_size < vector_sizes.length ()
8808                  && !(constant_multiple_p (current_vector_size,
8809                                            vector_sizes[next_size], &ratio)
8810                       && eiters >= lowest_vf / ratio))
8811             next_size += 1;
8812         }
8813       else
8814         while (next_size < vector_sizes.length ()
8815                && maybe_lt (current_vector_size, vector_sizes[next_size]))
8816           next_size += 1;
8817
8818       if (next_size == vector_sizes.length ())
8819         epilogue = NULL;
8820     }
8821
8822   if (epilogue)
8823     {
8824       epilogue->force_vectorize = loop->force_vectorize;
8825       epilogue->safelen = loop->safelen;
8826       epilogue->dont_vectorize = false;
8827
8828       /* We may need to if-convert epilogue to vectorize it.  */
8829       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8830         tree_if_conversion (epilogue);
8831     }
8832
8833   return epilogue;
8834 }
8835
8836 /* The code below is trying to perform simple optimization - revert
8837    if-conversion for masked stores, i.e. if the mask of a store is zero
8838    do not perform it and all stored value producers also if possible.
8839    For example,
8840      for (i=0; i<n; i++)
8841        if (c[i])
8842         {
8843           p1[i] += 1;
8844           p2[i] = p3[i] +2;
8845         }
8846    this transformation will produce the following semi-hammock:
8847
8848    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8849      {
8850        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8851        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8852        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8853        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8854        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8855        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8856      }
8857 */
8858
8859 void
8860 optimize_mask_stores (struct loop *loop)
8861 {
8862   basic_block *bbs = get_loop_body (loop);
8863   unsigned nbbs = loop->num_nodes;
8864   unsigned i;
8865   basic_block bb;
8866   struct loop *bb_loop;
8867   gimple_stmt_iterator gsi;
8868   gimple *stmt;
8869   auto_vec<gimple *> worklist;
8870
8871   vect_location = find_loop_location (loop);
8872   /* Pick up all masked stores in loop if any.  */
8873   for (i = 0; i < nbbs; i++)
8874     {
8875       bb = bbs[i];
8876       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8877            gsi_next (&gsi))
8878         {
8879           stmt = gsi_stmt (gsi);
8880           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8881             worklist.safe_push (stmt);
8882         }
8883     }
8884
8885   free (bbs);
8886   if (worklist.is_empty ())
8887     return;
8888
8889   /* Loop has masked stores.  */
8890   while (!worklist.is_empty ())
8891     {
8892       gimple *last, *last_store;
8893       edge e, efalse;
8894       tree mask;
8895       basic_block store_bb, join_bb;
8896       gimple_stmt_iterator gsi_to;
8897       tree vdef, new_vdef;
8898       gphi *phi;
8899       tree vectype;
8900       tree zero;
8901
8902       last = worklist.pop ();
8903       mask = gimple_call_arg (last, 2);
8904       bb = gimple_bb (last);
8905       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8906          the same loop as if_bb.  It could be different to LOOP when two
8907          level loop-nest is vectorized and mask_store belongs to the inner
8908          one.  */
8909       e = split_block (bb, last);
8910       bb_loop = bb->loop_father;
8911       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8912       join_bb = e->dest;
8913       store_bb = create_empty_bb (bb);
8914       add_bb_to_loop (store_bb, bb_loop);
8915       e->flags = EDGE_TRUE_VALUE;
8916       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8917       /* Put STORE_BB to likely part.  */
8918       efalse->probability = profile_probability::unlikely ();
8919       store_bb->count = efalse->count ();
8920       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8921       if (dom_info_available_p (CDI_DOMINATORS))
8922         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8923       if (dump_enabled_p ())
8924         dump_printf_loc (MSG_NOTE, vect_location,
8925                          "Create new block %d to sink mask stores.",
8926                          store_bb->index);
8927       /* Create vector comparison with boolean result.  */
8928       vectype = TREE_TYPE (mask);
8929       zero = build_zero_cst (vectype);
8930       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8931       gsi = gsi_last_bb (bb);
8932       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8933       /* Create new PHI node for vdef of the last masked store:
8934          .MEM_2 = VDEF <.MEM_1>
8935          will be converted to
8936          .MEM.3 = VDEF <.MEM_1>
8937          and new PHI node will be created in join bb
8938          .MEM_2 = PHI <.MEM_1, .MEM_3>
8939       */
8940       vdef = gimple_vdef (last);
8941       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8942       gimple_set_vdef (last, new_vdef);
8943       phi = create_phi_node (vdef, join_bb);
8944       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8945
8946       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8947       while (true)
8948         {
8949           gimple_stmt_iterator gsi_from;
8950           gimple *stmt1 = NULL;
8951
8952           /* Move masked store to STORE_BB.  */
8953           last_store = last;
8954           gsi = gsi_for_stmt (last);
8955           gsi_from = gsi;
8956           /* Shift GSI to the previous stmt for further traversal.  */
8957           gsi_prev (&gsi);
8958           gsi_to = gsi_start_bb (store_bb);
8959           gsi_move_before (&gsi_from, &gsi_to);
8960           /* Setup GSI_TO to the non-empty block start.  */
8961           gsi_to = gsi_start_bb (store_bb);
8962           if (dump_enabled_p ())
8963             {
8964               dump_printf_loc (MSG_NOTE, vect_location,
8965                                "Move stmt to created bb\n");
8966               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8967             }
8968           /* Move all stored value producers if possible.  */
8969           while (!gsi_end_p (gsi))
8970             {
8971               tree lhs;
8972               imm_use_iterator imm_iter;
8973               use_operand_p use_p;
8974               bool res;
8975
8976               /* Skip debug statements.  */
8977               if (is_gimple_debug (gsi_stmt (gsi)))
8978                 {
8979                   gsi_prev (&gsi);
8980                   continue;
8981                 }
8982               stmt1 = gsi_stmt (gsi);
8983               /* Do not consider statements writing to memory or having
8984                  volatile operand.  */
8985               if (gimple_vdef (stmt1)
8986                   || gimple_has_volatile_ops (stmt1))
8987                 break;
8988               gsi_from = gsi;
8989               gsi_prev (&gsi);
8990               lhs = gimple_get_lhs (stmt1);
8991               if (!lhs)
8992                 break;
8993
8994               /* LHS of vectorized stmt must be SSA_NAME.  */
8995               if (TREE_CODE (lhs) != SSA_NAME)
8996                 break;
8997
8998               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8999                 {
9000                   /* Remove dead scalar statement.  */
9001                   if (has_zero_uses (lhs))
9002                     {
9003                       gsi_remove (&gsi_from, true);
9004                       continue;
9005                     }
9006                 }
9007
9008               /* Check that LHS does not have uses outside of STORE_BB.  */
9009               res = true;
9010               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9011                 {
9012                   gimple *use_stmt;
9013                   use_stmt = USE_STMT (use_p);
9014                   if (is_gimple_debug (use_stmt))
9015                     continue;
9016                   if (gimple_bb (use_stmt) != store_bb)
9017                     {
9018                       res = false;
9019                       break;
9020                     }
9021                 }
9022               if (!res)
9023                 break;
9024
9025               if (gimple_vuse (stmt1)
9026                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
9027                 break;
9028
9029               /* Can move STMT1 to STORE_BB.  */
9030               if (dump_enabled_p ())
9031                 {
9032                   dump_printf_loc (MSG_NOTE, vect_location,
9033                                    "Move stmt to created bb\n");
9034                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9035                 }
9036               gsi_move_before (&gsi_from, &gsi_to);
9037               /* Shift GSI_TO for further insertion.  */
9038               gsi_prev (&gsi_to);
9039             }
9040           /* Put other masked stores with the same mask to STORE_BB.  */
9041           if (worklist.is_empty ()
9042               || gimple_call_arg (worklist.last (), 2) != mask
9043               || worklist.last () != stmt1)
9044             break;
9045           last = worklist.pop ();
9046         }
9047       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9048     }
9049 }