gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations which vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157
 158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 159    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 160    may already be set for general statements (not just data refs).  */
 161
 162 static bool
 163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
 164                               bool vectype_maybe_set_p,
 165                               poly_uint64 *vf,
 166                               vec<stmt_vec_info > *mask_producers)
 167 {
 168   gimple *stmt = stmt_info->stmt;
 169
 170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 171        && !STMT_VINFO_LIVE_P (stmt_info))
 172       || gimple_clobber_p (stmt))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 176       return true;
 177     }
 178
 179   tree stmt_vectype, nunits_vectype;
 180   if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
 181                                        &nunits_vectype))
 182     return false;
 183
 184   if (stmt_vectype)
 185     {
 186       if (STMT_VINFO_VECTYPE (stmt_info))
 187         /* The only case when a vectype had been already set is for stmts
 188            that contain a data ref, or for "pattern-stmts" (stmts generated
 189            by the vectorizer to represent/replace a certain idiom).  */
 190         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 191                      || vectype_maybe_set_p)
 192                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 193       else if (stmt_vectype == boolean_type_node)
 194         mask_producers->safe_push (stmt_info);
 195       else
 196         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 197     }
 198
 199   if (nunits_vectype)
 200     vect_update_max_nunits (vf, nunits_vectype);
 201
 202   return true;
 203 }
 204
 205 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 206    types of STMT_INFO and all attached pattern statements and update
 207    the vectorization factor VF accordingly.  If some of the statements
 208    produce a mask result whose vector type can only be calculated later,
 209    add them to MASK_PRODUCERS.  Return true on success or false if
 210    something prevented vectorization.  */
 211
 212 static bool
 213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
 214                             vec<stmt_vec_info > *mask_producers)
 215 {
 216   if (dump_enabled_p ())
 217     {
 218       dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
 219       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 220     }
 221   if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
 222     return false;
 223
 224   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 225       && STMT_VINFO_RELATED_STMT (stmt_info))
 226     {
 227       stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
 228
 229       /* If a pattern statement has def stmts, analyze them too.  */
 230       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 231       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 232            !gsi_end_p (si); gsi_next (&si))
 233         {
 234           stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
 235           if (dump_enabled_p ())
 236             {
 237               dump_printf_loc (MSG_NOTE, vect_location,
 238                                "==> examining pattern def stmt: ");
 239               dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 240                                 def_stmt_info->stmt, 0);
 241             }
 242           if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 243                                              vf, mask_producers))
 244             return false;
 245         }
 246
 247       if (dump_enabled_p ())
 248         {
 249           dump_printf_loc (MSG_NOTE, vect_location,
 250                            "==> examining pattern statement: ");
 251           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 252         }
 253       if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
 254         return false;
 255     }
 256
 257   return true;
 258 }
 259
 260 /* Function vect_determine_vectorization_factor
 261
 262    Determine the vectorization factor (VF).  VF is the number of data elements
 263    that are operated upon in parallel in a single iteration of the vectorized
 264    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 265    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 266    elements can fit in a single vector register.
 267
 268    We currently support vectorization of loops in which all types operated upon
 269    are of the same size.  Therefore this function currently sets VF according to
 270    the size of the types operated upon, and fails if there are multiple sizes
 271    in the loop.
 272
 273    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 274    original loop:
 275         for (i=0; i<N; i++){
 276           a[i] = b[i] + c[i];
 277         }
 278
 279    vectorized loop:
 280         for (i=0; i<N; i+=VF){
 281           a[i:VF] = b[i:VF] + c[i:VF];
 282         }
 283 */
 284
 285 static bool
 286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 287 {
 288   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 289   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 290   unsigned nbbs = loop->num_nodes;
 291   poly_uint64 vectorization_factor = 1;
 292   tree scalar_type = NULL_TREE;
 293   gphi *phi;
 294   tree vectype;
 295   stmt_vec_info stmt_info;
 296   unsigned i;
 297   auto_vec<stmt_vec_info> mask_producers;
 298
 299   if (dump_enabled_p ())
 300     dump_printf_loc (MSG_NOTE, vect_location,
 301                      "=== vect_determine_vectorization_factor ===\n");
 302
 303   for (i = 0; i < nbbs; i++)
 304     {
 305       basic_block bb = bbs[i];
 306
 307       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 308            gsi_next (&si))
 309         {
 310           phi = si.phi ();
 311           stmt_info = vinfo_for_stmt (phi);
 312           if (dump_enabled_p ())
 313             {
 314               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 315               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 316             }
 317
 318           gcc_assert (stmt_info);
 319
 320           if (STMT_VINFO_RELEVANT_P (stmt_info)
 321               || STMT_VINFO_LIVE_P (stmt_info))
 322             {
 323               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 324               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 325
 326               if (dump_enabled_p ())
 327                 {
 328                   dump_printf_loc (MSG_NOTE, vect_location,
 329                                    "get vectype for scalar type:  ");
 330                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 331                   dump_printf (MSG_NOTE, "\n");
 332                 }
 333
 334               vectype = get_vectype_for_scalar_type (scalar_type);
 335               if (!vectype)
 336                 {
 337                   if (dump_enabled_p ())
 338                     {
 339                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 340                                        "not vectorized: unsupported "
 341                                        "data-type ");
 342                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 343                                          scalar_type);
 344                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 345                     }
 346                   return false;
 347                 }
 348               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 349
 350               if (dump_enabled_p ())
 351                 {
 352                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 353                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 354                   dump_printf (MSG_NOTE, "\n");
 355                 }
 356
 357               if (dump_enabled_p ())
 358                 {
 359                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 360                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 361                   dump_printf (MSG_NOTE, "\n");
 362                 }
 363
 364               vect_update_max_nunits (&vectorization_factor, vectype);
 365             }
 366         }
 367
 368       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 369            gsi_next (&si))
 370         {
 371           stmt_info = vinfo_for_stmt (gsi_stmt (si));
 372           if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
 373                                            &mask_producers))
 374             return false;
 375         }
 376     }
 377
 378   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 379   if (dump_enabled_p ())
 380     {
 381       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 382       dump_dec (MSG_NOTE, vectorization_factor);
 383       dump_printf (MSG_NOTE, "\n");
 384     }
 385
 386   if (known_le (vectorization_factor, 1U))
 387     {
 388       if (dump_enabled_p ())
 389         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 390                          "not vectorized: unsupported data-type\n");
 391       return false;
 392     }
 393   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 394
 395   for (i = 0; i < mask_producers.length (); i++)
 396     {
 397       stmt_info = mask_producers[i];
 398       tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
 399       if (!mask_type)
 400         return false;
 401       STMT_VINFO_VECTYPE (stmt_info) = mask_type;
 402     }
 403
 404   return true;
 405 }
 406
 407
 408 /* Function vect_is_simple_iv_evolution.
 409
 410    FORNOW: A simple evolution of an induction variables in the loop is
 411    considered a polynomial evolution.  */
 412
 413 static bool
 414 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 415                              tree * step)
 416 {
 417   tree init_expr;
 418   tree step_expr;
 419   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 420   basic_block bb;
 421
 422   /* When there is no evolution in this loop, the evolution function
 423      is not "simple".  */
 424   if (evolution_part == NULL_TREE)
 425     return false;
 426
 427   /* When the evolution is a polynomial of degree >= 2
 428      the evolution function is not "simple".  */
 429   if (tree_is_chrec (evolution_part))
 430     return false;
 431
 432   step_expr = evolution_part;
 433   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 434
 435   if (dump_enabled_p ())
 436     {
 437       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 438       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 439       dump_printf (MSG_NOTE, ",  init: ");
 440       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 441       dump_printf (MSG_NOTE, "\n");
 442     }
 443
 444   *init = init_expr;
 445   *step = step_expr;
 446
 447   if (TREE_CODE (step_expr) != INTEGER_CST
 448       && (TREE_CODE (step_expr) != SSA_NAME
 449           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 450               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 451           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 452               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 453                   || !flag_associative_math)))
 454       && (TREE_CODE (step_expr) != REAL_CST
 455           || !flag_associative_math))
 456     {
 457       if (dump_enabled_p ())
 458         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 459                          "step unknown.\n");
 460       return false;
 461     }
 462
 463   return true;
 464 }
 465
 466 /* Function vect_analyze_scalar_cycles_1.
 467
 468    Examine the cross iteration def-use cycles of scalar variables
 469    in LOOP.  LOOP_VINFO represents the loop that is now being
 470    considered for vectorization (can be LOOP, or an outer-loop
 471    enclosing LOOP).  */
 472
 473 static void
 474 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 475 {
 476   basic_block bb = loop->header;
 477   tree init, step;
 478   auto_vec<gimple *, 64> worklist;
 479   gphi_iterator gsi;
 480   bool double_reduc;
 481
 482   if (dump_enabled_p ())
 483     dump_printf_loc (MSG_NOTE, vect_location,
 484                      "=== vect_analyze_scalar_cycles ===\n");
 485
 486   /* First - identify all inductions.  Reduction detection assumes that all the
 487      inductions have been identified, therefore, this order must not be
 488      changed.  */
 489   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 490     {
 491       gphi *phi = gsi.phi ();
 492       tree access_fn = NULL;
 493       tree def = PHI_RESULT (phi);
 494       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 495
 496       if (dump_enabled_p ())
 497         {
 498           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 499           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 500         }
 501
 502       /* Skip virtual phi's.  The data dependences that are associated with
 503          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 504       if (virtual_operand_p (def))
 505         continue;
 506
 507       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 508
 509       /* Analyze the evolution function.  */
 510       access_fn = analyze_scalar_evolution (loop, def);
 511       if (access_fn)
 512         {
 513           STRIP_NOPS (access_fn);
 514           if (dump_enabled_p ())
 515             {
 516               dump_printf_loc (MSG_NOTE, vect_location,
 517                                "Access function of PHI: ");
 518               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 519               dump_printf (MSG_NOTE, "\n");
 520             }
 521           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 522             = initial_condition_in_loop_num (access_fn, loop->num);
 523           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 524             = evolution_part_in_loop_num (access_fn, loop->num);
 525         }
 526
 527       if (!access_fn
 528           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 529           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 530               && TREE_CODE (step) != INTEGER_CST))
 531         {
 532           worklist.safe_push (phi);
 533           continue;
 534         }
 535
 536       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 537                   != NULL_TREE);
 538       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 539
 540       if (dump_enabled_p ())
 541         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 542       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 543     }
 544
 545
 546   /* Second - identify all reductions and nested cycles.  */
 547   while (worklist.length () > 0)
 548     {
 549       gimple *phi = worklist.pop ();
 550       tree def = PHI_RESULT (phi);
 551       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 552       gimple *reduc_stmt;
 553
 554       if (dump_enabled_p ())
 555         {
 556           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 557           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 558         }
 559
 560       gcc_assert (!virtual_operand_p (def)
 561                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 562
 563       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 564                                                 &double_reduc, false);
 565       if (reduc_stmt)
 566         {
 567           if (double_reduc)
 568             {
 569               if (dump_enabled_p ())
 570                 dump_printf_loc (MSG_NOTE, vect_location,
 571                                  "Detected double reduction.\n");
 572
 573               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 574               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 575                                                     vect_double_reduction_def;
 576             }
 577           else
 578             {
 579               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 580                 {
 581                   if (dump_enabled_p ())
 582                     dump_printf_loc (MSG_NOTE, vect_location,
 583                                      "Detected vectorizable nested cycle.\n");
 584
 585                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 586                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 587                                                              vect_nested_cycle;
 588                 }
 589               else
 590                 {
 591                   if (dump_enabled_p ())
 592                     dump_printf_loc (MSG_NOTE, vect_location,
 593                                      "Detected reduction.\n");
 594
 595                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 596                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 597                                                            vect_reduction_def;
 598                   /* Store the reduction cycles for possible vectorization in
 599                      loop-aware SLP if it was not detected as reduction
 600                      chain.  */
 601                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 602                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 603                 }
 604             }
 605         }
 606       else
 607         if (dump_enabled_p ())
 608           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 609                            "Unknown def-use cycle pattern.\n");
 610     }
 611 }
 612
 613
 614 /* Function vect_analyze_scalar_cycles.
 615
 616    Examine the cross iteration def-use cycles of scalar variables, by
 617    analyzing the loop-header PHIs of scalar variables.  Classify each
 618    cycle as one of the following: invariant, induction, reduction, unknown.
 619    We do that for the loop represented by LOOP_VINFO, and also to its
 620    inner-loop, if exists.
 621    Examples for scalar cycles:
 622
 623    Example1: reduction:
 624
 625               loop1:
 626               for (i=0; i<N; i++)
 627                  sum += a[i];
 628
 629    Example2: induction:
 630
 631               loop2:
 632               for (i=0; i<N; i++)
 633                  a[i] = i;  */
 634
 635 static void
 636 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 637 {
 638   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 639
 640   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 641
 642   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 643      Reductions in such inner-loop therefore have different properties than
 644      the reductions in the nest that gets vectorized:
 645      1. When vectorized, they are executed in the same order as in the original
 646         scalar loop, so we can't change the order of computation when
 647         vectorizing them.
 648      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 649         current checks are too strict.  */
 650
 651   if (loop->inner)
 652     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 653 }
 654
 655 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 656
 657 static void
 658 vect_fixup_reduc_chain (gimple *stmt)
 659 {
 660   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 661   gimple *stmtp;
 662   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 663               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 664   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 665   do
 666     {
 667       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 668       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 669       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 670       if (stmt)
 671         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 672           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 673     }
 674   while (stmt);
 675   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 676 }
 677
 678 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 679
 680 static void
 681 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 682 {
 683   gimple *first;
 684   unsigned i;
 685
 686   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 687     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 688       {
 689         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 690         while (next)
 691           {
 692             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 693               break;
 694             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 695           }
 696         /* If not all stmt in the chain are patterns try to handle
 697            the chain without patterns.  */
 698         if (! next)
 699           {
 700             vect_fixup_reduc_chain (first);
 701             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 702               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 703           }
 704       }
 705 }
 706
 707 /* Function vect_get_loop_niters.
 708
 709    Determine how many iterations the loop is executed and place it
 710    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 711    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 712    niter information holds in ASSUMPTIONS.
 713
 714    Return the loop exit condition.  */
 715
 716
 717 static gcond *
 718 vect_get_loop_niters (struct loop *loop, tree *assumptions,
 719                       tree *number_of_iterations, tree *number_of_iterationsm1)
 720 {
 721   edge exit = single_exit (loop);
 722   struct tree_niter_desc niter_desc;
 723   tree niter_assumptions, niter, may_be_zero;
 724   gcond *cond = get_loop_exit_condition (loop);
 725
 726   *assumptions = boolean_true_node;
 727   *number_of_iterationsm1 = chrec_dont_know;
 728   *number_of_iterations = chrec_dont_know;
 729   if (dump_enabled_p ())
 730     dump_printf_loc (MSG_NOTE, vect_location,
 731                      "=== get_loop_niters ===\n");
 732
 733   if (!exit)
 734     return cond;
 735
 736   niter = chrec_dont_know;
 737   may_be_zero = NULL_TREE;
 738   niter_assumptions = boolean_true_node;
 739   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 740       || chrec_contains_undetermined (niter_desc.niter))
 741     return cond;
 742
 743   niter_assumptions = niter_desc.assumptions;
 744   may_be_zero = niter_desc.may_be_zero;
 745   niter = niter_desc.niter;
 746
 747   if (may_be_zero && integer_zerop (may_be_zero))
 748     may_be_zero = NULL_TREE;
 749
 750   if (may_be_zero)
 751     {
 752       if (COMPARISON_CLASS_P (may_be_zero))
 753         {
 754           /* Try to combine may_be_zero with assumptions, this can simplify
 755              computation of niter expression.  */
 756           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 757             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 758                                              niter_assumptions,
 759                                              fold_build1 (TRUTH_NOT_EXPR,
 760                                                           boolean_type_node,
 761                                                           may_be_zero));
 762           else
 763             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 764                                  build_int_cst (TREE_TYPE (niter), 0),
 765                                  rewrite_to_non_trapping_overflow (niter));
 766
 767           may_be_zero = NULL_TREE;
 768         }
 769       else if (integer_nonzerop (may_be_zero))
 770         {
 771           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 772           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 773           return cond;
 774         }
 775       else
 776         return cond;
 777     }
 778
 779   *assumptions = niter_assumptions;
 780   *number_of_iterationsm1 = niter;
 781
 782   /* We want the number of loop header executions which is the number
 783      of latch executions plus one.
 784      ???  For UINT_MAX latch executions this number overflows to zero
 785      for loops like do { n++; } while (n != 0);  */
 786   if (niter && !chrec_contains_undetermined (niter))
 787     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 788                           build_int_cst (TREE_TYPE (niter), 1));
 789   *number_of_iterations = niter;
 790
 791   return cond;
 792 }
 793
 794 /* Function bb_in_loop_p
 795
 796    Used as predicate for dfs order traversal of the loop bbs.  */
 797
 798 static bool
 799 bb_in_loop_p (const_basic_block bb, const void *data)
 800 {
 801   const struct loop *const loop = (const struct loop *)data;
 802   if (flow_bb_inside_loop_p (loop, bb))
 803     return true;
 804   return false;
 805 }
 806
 807
 808 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 809    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 810
 811 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
 812   : vec_info (vec_info::loop, init_cost (loop_in)),
 813     loop (loop_in),
 814     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 815     num_itersm1 (NULL_TREE),
 816     num_iters (NULL_TREE),
 817     num_iters_unchanged (NULL_TREE),
 818     num_iters_assumptions (NULL_TREE),
 819     th (0),
 820     versioning_threshold (0),
 821     vectorization_factor (0),
 822     max_vectorization_factor (0),
 823     mask_skip_niters (NULL_TREE),
 824     mask_compare_type (NULL_TREE),
 825     unaligned_dr (NULL),
 826     peeling_for_alignment (0),
 827     ptr_mask (0),
 828     ivexpr_map (NULL),
 829     slp_unrolling_factor (1),
 830     single_scalar_iteration_cost (0),
 831     vectorizable (false),
 832     can_fully_mask_p (true),
 833     fully_masked_p (false),
 834     peeling_for_gaps (false),
 835     peeling_for_niter (false),
 836     operands_swapped (false),
 837     no_data_dependencies (false),
 838     has_mask_store (false),
 839     scalar_loop (NULL),
 840     orig_loop_info (NULL)
 841 {
 842   /* Create/Update stmt_info for all stmts in the loop.  */
 843   basic_block *body = get_loop_body (loop);
 844   for (unsigned int i = 0; i < loop->num_nodes; i++)
 845     {
 846       basic_block bb = body[i];
 847       gimple_stmt_iterator si;
 848
 849       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 850         {
 851           gimple *phi = gsi_stmt (si);
 852           gimple_set_uid (phi, 0);
 853           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
 854         }
 855
 856       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 857         {
 858           gimple *stmt = gsi_stmt (si);
 859           gimple_set_uid (stmt, 0);
 860           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
 861         }
 862     }
 863   free (body);
 864
 865   /* CHECKME: We want to visit all BBs before their successors (except for
 866      latch blocks, for which this assertion wouldn't hold).  In the simple
 867      case of the loop forms we allow, a dfs order of the BBs would the same
 868      as reversed postorder traversal, so we are safe.  */
 869
 870   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 871                                           bbs, loop->num_nodes, loop);
 872   gcc_assert (nbbs == loop->num_nodes);
 873 }
 874
 875 /* Free all levels of MASKS.  */
 876
 877 void
 878 release_vec_loop_masks (vec_loop_masks *masks)
 879 {
 880   rgroup_masks *rgm;
 881   unsigned int i;
 882   FOR_EACH_VEC_ELT (*masks, i, rgm)
 883     rgm->masks.release ();
 884   masks->release ();
 885 }
 886
 887 /* Free all memory used by the _loop_vec_info, as well as all the
 888    stmt_vec_info structs of all the stmts in the loop.  */
 889
 890 _loop_vec_info::~_loop_vec_info ()
 891 {
 892   int nbbs;
 893   gimple_stmt_iterator si;
 894   int j;
 895
 896   nbbs = loop->num_nodes;
 897   for (j = 0; j < nbbs; j++)
 898     {
 899       basic_block bb = bbs[j];
 900       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 901         free_stmt_vec_info (gsi_stmt (si));
 902
 903       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 904         {
 905           gimple *stmt = gsi_stmt (si);
 906
 907           /* We may have broken canonical form by moving a constant
 908              into RHS1 of a commutative op.  Fix such occurrences.  */
 909           if (operands_swapped && is_gimple_assign (stmt))
 910             {
 911               enum tree_code code = gimple_assign_rhs_code (stmt);
 912
 913               if ((code == PLUS_EXPR
 914                    || code == POINTER_PLUS_EXPR
 915                    || code == MULT_EXPR)
 916                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 917                 swap_ssa_operands (stmt,
 918                                    gimple_assign_rhs1_ptr (stmt),
 919                                    gimple_assign_rhs2_ptr (stmt));
 920               else if (code == COND_EXPR
 921                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
 922                 {
 923                   tree cond_expr = gimple_assign_rhs1 (stmt);
 924                   enum tree_code cond_code = TREE_CODE (cond_expr);
 925
 926                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
 927                     {
 928                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
 929                                                                   0));
 930                       cond_code = invert_tree_comparison (cond_code,
 931                                                           honor_nans);
 932                       if (cond_code != ERROR_MARK)
 933                         {
 934                           TREE_SET_CODE (cond_expr, cond_code);
 935                           swap_ssa_operands (stmt,
 936                                              gimple_assign_rhs2_ptr (stmt),
 937                                              gimple_assign_rhs3_ptr (stmt));
 938                         }
 939                     }
 940                 }
 941             }
 942
 943           /* Free stmt_vec_info.  */
 944           free_stmt_vec_info (stmt);
 945           gsi_next (&si);
 946         }
 947     }
 948
 949   free (bbs);
 950
 951   release_vec_loop_masks (&masks);
 952   delete ivexpr_map;
 953
 954   loop->aux = NULL;
 955 }
 956
 957 /* Return an invariant or register for EXPR and emit necessary
 958    computations in the LOOP_VINFO loop preheader.  */
 959
 960 tree
 961 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 962 {
 963   if (is_gimple_reg (expr)
 964       || is_gimple_min_invariant (expr))
 965     return expr;
 966
 967   if (! loop_vinfo->ivexpr_map)
 968     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 969   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 970   if (! cached)
 971     {
 972       gimple_seq stmts = NULL;
 973       cached = force_gimple_operand (unshare_expr (expr),
 974                                      &stmts, true, NULL_TREE);
 975       if (stmts)
 976         {
 977           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 978           gsi_insert_seq_on_edge_immediate (e, stmts);
 979         }
 980     }
 981   return cached;
 982 }
 983
 984 /* Return true if we can use CMP_TYPE as the comparison type to produce
 985    all masks required to mask LOOP_VINFO.  */
 986
 987 static bool
 988 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 989 {
 990   rgroup_masks *rgm;
 991   unsigned int i;
 992   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 993     if (rgm->mask_type != NULL_TREE
 994         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 995                                             cmp_type, rgm->mask_type,
 996                                             OPTIMIZE_FOR_SPEED))
 997       return false;
 998   return true;
 999 }
1000
1001 /* Calculate the maximum number of scalars per iteration for every
1002    rgroup in LOOP_VINFO.  */
1003
1004 static unsigned int
1005 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1006 {
1007   unsigned int res = 1;
1008   unsigned int i;
1009   rgroup_masks *rgm;
1010   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1011     res = MAX (res, rgm->max_nscalars_per_iter);
1012   return res;
1013 }
1014
1015 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1016    whether we can actually generate the masks required.  Return true if so,
1017    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1018
1019 static bool
1020 vect_verify_full_masking (loop_vec_info loop_vinfo)
1021 {
1022   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1023   unsigned int min_ni_width;
1024
1025   /* Use a normal loop if there are no statements that need masking.
1026      This only happens in rare degenerate cases: it means that the loop
1027      has no loads, no stores, and no live-out values.  */
1028   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1029     return false;
1030
1031   /* Get the maximum number of iterations that is representable
1032      in the counter type.  */
1033   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1034   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1035
1036   /* Get a more refined estimate for the number of iterations.  */
1037   widest_int max_back_edges;
1038   if (max_loop_iterations (loop, &max_back_edges))
1039     max_ni = wi::smin (max_ni, max_back_edges + 1);
1040
1041   /* Account for rgroup masks, in which each bit is replicated N times.  */
1042   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1043
1044   /* Work out how many bits we need to represent the limit.  */
1045   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1046
1047   /* Find a scalar mode for which WHILE_ULT is supported.  */
1048   opt_scalar_int_mode cmp_mode_iter;
1049   tree cmp_type = NULL_TREE;
1050   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1051     {
1052       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1053       if (cmp_bits >= min_ni_width
1054           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1055         {
1056           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1057           if (this_type
1058               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1059             {
1060               /* Although we could stop as soon as we find a valid mode,
1061                  it's often better to continue until we hit Pmode, since the
1062                  operands to the WHILE are more likely to be reusable in
1063                  address calculations.  */
1064               cmp_type = this_type;
1065               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1066                 break;
1067             }
1068         }
1069     }
1070
1071   if (!cmp_type)
1072     return false;
1073
1074   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1075   return true;
1076 }
1077
1078 /* Calculate the cost of one scalar iteration of the loop.  */
1079 static void
1080 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1081 {
1082   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1083   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1084   int nbbs = loop->num_nodes, factor;
1085   int innerloop_iters, i;
1086
1087   /* Gather costs for statements in the scalar loop.  */
1088
1089   /* FORNOW.  */
1090   innerloop_iters = 1;
1091   if (loop->inner)
1092     innerloop_iters = 50; /* FIXME */
1093
1094   for (i = 0; i < nbbs; i++)
1095     {
1096       gimple_stmt_iterator si;
1097       basic_block bb = bbs[i];
1098
1099       if (bb->loop_father == loop->inner)
1100         factor = innerloop_iters;
1101       else
1102         factor = 1;
1103
1104       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1105         {
1106           gimple *stmt = gsi_stmt (si);
1107           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1108
1109           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1110             continue;
1111
1112           /* Skip stmts that are not vectorized inside the loop.  */
1113           if (stmt_info
1114               && !STMT_VINFO_RELEVANT_P (stmt_info)
1115               && (!STMT_VINFO_LIVE_P (stmt_info)
1116                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1117               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1118             continue;
1119
1120           vect_cost_for_stmt kind;
1121           if (STMT_VINFO_DATA_REF (stmt_info))
1122             {
1123               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1124                kind = scalar_load;
1125              else
1126                kind = scalar_store;
1127             }
1128           else
1129             kind = scalar_stmt;
1130
1131           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1132                             factor, kind, stmt_info, 0, vect_prologue);
1133         }
1134     }
1135
1136   /* Now accumulate cost.  */
1137   void *target_cost_data = init_cost (loop);
1138   stmt_info_for_cost *si;
1139   int j;
1140   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1141                     j, si)
1142     {
1143       struct _stmt_vec_info *stmt_info
1144         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1145       (void) add_stmt_cost (target_cost_data, si->count,
1146                             si->kind, stmt_info, si->misalign,
1147                             vect_body);
1148     }
1149   unsigned dummy, body_cost = 0;
1150   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1151   destroy_cost_data (target_cost_data);
1152   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1153 }
1154
1155
1156 /* Function vect_analyze_loop_form_1.
1157
1158    Verify that certain CFG restrictions hold, including:
1159    - the loop has a pre-header
1160    - the loop has a single entry and exit
1161    - the loop exit condition is simple enough
1162    - the number of iterations can be analyzed, i.e, a countable loop.  The
1163      niter could be analyzed under some assumptions.  */
1164
1165 bool
1166 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1167                           tree *assumptions, tree *number_of_iterationsm1,
1168                           tree *number_of_iterations, gcond **inner_loop_cond)
1169 {
1170   if (dump_enabled_p ())
1171     dump_printf_loc (MSG_NOTE, vect_location,
1172                      "=== vect_analyze_loop_form ===\n");
1173
1174   /* Different restrictions apply when we are considering an inner-most loop,
1175      vs. an outer (nested) loop.
1176      (FORNOW. May want to relax some of these restrictions in the future).  */
1177
1178   if (!loop->inner)
1179     {
1180       /* Inner-most loop.  We currently require that the number of BBs is
1181          exactly 2 (the header and latch).  Vectorizable inner-most loops
1182          look like this:
1183
1184                         (pre-header)
1185                            |
1186                           header <--------+
1187                            | |            |
1188                            | +--> latch --+
1189                            |
1190                         (exit-bb)  */
1191
1192       if (loop->num_nodes != 2)
1193         {
1194           if (dump_enabled_p ())
1195             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196                              "not vectorized: control flow in loop.\n");
1197           return false;
1198         }
1199
1200       if (empty_block_p (loop->header))
1201         {
1202           if (dump_enabled_p ())
1203             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1204                              "not vectorized: empty loop.\n");
1205           return false;
1206         }
1207     }
1208   else
1209     {
1210       struct loop *innerloop = loop->inner;
1211       edge entryedge;
1212
1213       /* Nested loop. We currently require that the loop is doubly-nested,
1214          contains a single inner loop, and the number of BBs is exactly 5.
1215          Vectorizable outer-loops look like this:
1216
1217                         (pre-header)
1218                            |
1219                           header <---+
1220                            |         |
1221                           inner-loop |
1222                            |         |
1223                           tail ------+
1224                            |
1225                         (exit-bb)
1226
1227          The inner-loop has the properties expected of inner-most loops
1228          as described above.  */
1229
1230       if ((loop->inner)->inner || (loop->inner)->next)
1231         {
1232           if (dump_enabled_p ())
1233             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234                              "not vectorized: multiple nested loops.\n");
1235           return false;
1236         }
1237
1238       if (loop->num_nodes != 5)
1239         {
1240           if (dump_enabled_p ())
1241             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1242                              "not vectorized: control flow in loop.\n");
1243           return false;
1244         }
1245
1246       entryedge = loop_preheader_edge (innerloop);
1247       if (entryedge->src != loop->header
1248           || !single_exit (innerloop)
1249           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1250         {
1251           if (dump_enabled_p ())
1252             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1253                              "not vectorized: unsupported outerloop form.\n");
1254           return false;
1255         }
1256
1257       /* Analyze the inner-loop.  */
1258       tree inner_niterm1, inner_niter, inner_assumptions;
1259       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1260                                       &inner_assumptions, &inner_niterm1,
1261                                       &inner_niter, NULL)
1262           /* Don't support analyzing niter under assumptions for inner
1263              loop.  */
1264           || !integer_onep (inner_assumptions))
1265         {
1266           if (dump_enabled_p ())
1267             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1268                              "not vectorized: Bad inner loop.\n");
1269           return false;
1270         }
1271
1272       if (!expr_invariant_in_loop_p (loop, inner_niter))
1273         {
1274           if (dump_enabled_p ())
1275             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1276                              "not vectorized: inner-loop count not"
1277                              " invariant.\n");
1278           return false;
1279         }
1280
1281       if (dump_enabled_p ())
1282         dump_printf_loc (MSG_NOTE, vect_location,
1283                          "Considering outer-loop vectorization.\n");
1284     }
1285
1286   if (!single_exit (loop)
1287       || EDGE_COUNT (loop->header->preds) != 2)
1288     {
1289       if (dump_enabled_p ())
1290         {
1291           if (!single_exit (loop))
1292             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293                              "not vectorized: multiple exits.\n");
1294           else if (EDGE_COUNT (loop->header->preds) != 2)
1295             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1296                              "not vectorized: too many incoming edges.\n");
1297         }
1298       return false;
1299     }
1300
1301   /* We assume that the loop exit condition is at the end of the loop. i.e,
1302      that the loop is represented as a do-while (with a proper if-guard
1303      before the loop if needed), where the loop header contains all the
1304      executable statements, and the latch is empty.  */
1305   if (!empty_block_p (loop->latch)
1306       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1307     {
1308       if (dump_enabled_p ())
1309         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1310                          "not vectorized: latch block not empty.\n");
1311       return false;
1312     }
1313
1314   /* Make sure the exit is not abnormal.  */
1315   edge e = single_exit (loop);
1316   if (e->flags & EDGE_ABNORMAL)
1317     {
1318       if (dump_enabled_p ())
1319         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1320                          "not vectorized: abnormal loop exit edge.\n");
1321       return false;
1322     }
1323
1324   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1325                                      number_of_iterationsm1);
1326   if (!*loop_cond)
1327     {
1328       if (dump_enabled_p ())
1329         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330                          "not vectorized: complicated exit condition.\n");
1331       return false;
1332     }
1333
1334   if (integer_zerop (*assumptions)
1335       || !*number_of_iterations
1336       || chrec_contains_undetermined (*number_of_iterations))
1337     {
1338       if (dump_enabled_p ())
1339         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340                          "not vectorized: number of iterations cannot be "
1341                          "computed.\n");
1342       return false;
1343     }
1344
1345   if (integer_zerop (*number_of_iterations))
1346     {
1347       if (dump_enabled_p ())
1348         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1349                          "not vectorized: number of iterations = 0.\n");
1350       return false;
1351     }
1352
1353   return true;
1354 }
1355
1356 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1357
1358 loop_vec_info
1359 vect_analyze_loop_form (struct loop *loop)
1360 {
1361   tree assumptions, number_of_iterations, number_of_iterationsm1;
1362   gcond *loop_cond, *inner_loop_cond = NULL;
1363
1364   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1365                                   &assumptions, &number_of_iterationsm1,
1366                                   &number_of_iterations, &inner_loop_cond))
1367     return NULL;
1368
1369   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1370   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1371   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1372   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1373   if (!integer_onep (assumptions))
1374     {
1375       /* We consider to vectorize this loop by versioning it under
1376          some assumptions.  In order to do this, we need to clear
1377          existing information computed by scev and niter analyzer.  */
1378       scev_reset_htab ();
1379       free_numbers_of_iterations_estimates (loop);
1380       /* Also set flag for this loop so that following scev and niter
1381          analysis are done under the assumptions.  */
1382       loop_constraint_set (loop, LOOP_C_FINITE);
1383       /* Also record the assumptions for versioning.  */
1384       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1385     }
1386
1387   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1388     {
1389       if (dump_enabled_p ())
1390         {
1391           dump_printf_loc (MSG_NOTE, vect_location,
1392                            "Symbolic number of iterations is ");
1393           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1394           dump_printf (MSG_NOTE, "\n");
1395         }
1396     }
1397
1398   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1399   if (inner_loop_cond)
1400     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1401       = loop_exit_ctrl_vec_info_type;
1402
1403   gcc_assert (!loop->aux);
1404   loop->aux = loop_vinfo;
1405   return loop_vinfo;
1406 }
1407
1408
1409
1410 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1411    statements update the vectorization factor.  */
1412
1413 static void
1414 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1415 {
1416   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1417   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1418   int nbbs = loop->num_nodes;
1419   poly_uint64 vectorization_factor;
1420   int i;
1421
1422   if (dump_enabled_p ())
1423     dump_printf_loc (MSG_NOTE, vect_location,
1424                      "=== vect_update_vf_for_slp ===\n");
1425
1426   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1427   gcc_assert (known_ne (vectorization_factor, 0U));
1428
1429   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1430      vectorization factor of the loop is the unrolling factor required by
1431      the SLP instances.  If that unrolling factor is 1, we say, that we
1432      perform pure SLP on loop - cross iteration parallelism is not
1433      exploited.  */
1434   bool only_slp_in_loop = true;
1435   for (i = 0; i < nbbs; i++)
1436     {
1437       basic_block bb = bbs[i];
1438       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1439            gsi_next (&si))
1440         {
1441           gimple *stmt = gsi_stmt (si);
1442           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1443           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1444               && STMT_VINFO_RELATED_STMT (stmt_info))
1445             {
1446               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1447               stmt_info = vinfo_for_stmt (stmt);
1448             }
1449           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1450                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1451               && !PURE_SLP_STMT (stmt_info))
1452             /* STMT needs both SLP and loop-based vectorization.  */
1453             only_slp_in_loop = false;
1454         }
1455     }
1456
1457   if (only_slp_in_loop)
1458     {
1459       dump_printf_loc (MSG_NOTE, vect_location,
1460                        "Loop contains only SLP stmts\n");
1461       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1462     }
1463   else
1464     {
1465       dump_printf_loc (MSG_NOTE, vect_location,
1466                        "Loop contains SLP and non-SLP stmts\n");
1467       /* Both the vectorization factor and unroll factor have the form
1468          current_vector_size * X for some rational X, so they must have
1469          a common multiple.  */
1470       vectorization_factor
1471         = force_common_multiple (vectorization_factor,
1472                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1473     }
1474
1475   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1476   if (dump_enabled_p ())
1477     {
1478       dump_printf_loc (MSG_NOTE, vect_location,
1479                        "Updating vectorization factor to ");
1480       dump_dec (MSG_NOTE, vectorization_factor);
1481       dump_printf (MSG_NOTE, ".\n");
1482     }
1483 }
1484
1485 /* Return true if STMT_INFO describes a double reduction phi and if
1486    the other phi in the reduction is also relevant for vectorization.
1487    This rejects cases such as:
1488
1489       outer1:
1490         x_1 = PHI <x_3(outer2), ...>;
1491         ...
1492
1493       inner:
1494         x_2 = ...;
1495         ...
1496
1497       outer2:
1498         x_3 = PHI <x_2(inner)>;
1499
1500    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1501
1502 static bool
1503 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1504 {
1505   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1506     return false;
1507
1508   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1509   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1510 }
1511
1512 /* Function vect_analyze_loop_operations.
1513
1514    Scan the loop stmts and make sure they are all vectorizable.  */
1515
1516 static bool
1517 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1518 {
1519   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1520   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1521   int nbbs = loop->num_nodes;
1522   int i;
1523   stmt_vec_info stmt_info;
1524   bool need_to_vectorize = false;
1525   bool ok;
1526
1527   if (dump_enabled_p ())
1528     dump_printf_loc (MSG_NOTE, vect_location,
1529                      "=== vect_analyze_loop_operations ===\n");
1530
1531   stmt_vector_for_cost cost_vec;
1532   cost_vec.create (2);
1533
1534   for (i = 0; i < nbbs; i++)
1535     {
1536       basic_block bb = bbs[i];
1537
1538       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1539            gsi_next (&si))
1540         {
1541           gphi *phi = si.phi ();
1542           ok = true;
1543
1544           stmt_info = vinfo_for_stmt (phi);
1545           if (dump_enabled_p ())
1546             {
1547               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1548               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1549             }
1550           if (virtual_operand_p (gimple_phi_result (phi)))
1551             continue;
1552
1553           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1554              (i.e., a phi in the tail of the outer-loop).  */
1555           if (! is_loop_header_bb_p (bb))
1556             {
1557               /* FORNOW: we currently don't support the case that these phis
1558                  are not used in the outerloop (unless it is double reduction,
1559                  i.e., this phi is vect_reduction_def), cause this case
1560                  requires to actually do something here.  */
1561               if (STMT_VINFO_LIVE_P (stmt_info)
1562                   && !vect_active_double_reduction_p (stmt_info))
1563                 {
1564                   if (dump_enabled_p ())
1565                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1566                                      "Unsupported loop-closed phi in "
1567                                      "outer-loop.\n");
1568                   return false;
1569                 }
1570
1571               /* If PHI is used in the outer loop, we check that its operand
1572                  is defined in the inner loop.  */
1573               if (STMT_VINFO_RELEVANT_P (stmt_info))
1574                 {
1575                   tree phi_op;
1576                   gimple *op_def_stmt;
1577
1578                   if (gimple_phi_num_args (phi) != 1)
1579                     return false;
1580
1581                   phi_op = PHI_ARG_DEF (phi, 0);
1582                   if (TREE_CODE (phi_op) != SSA_NAME)
1583                     return false;
1584
1585                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1586                   if (gimple_nop_p (op_def_stmt)
1587                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1588                       || !vinfo_for_stmt (op_def_stmt))
1589                     return false;
1590
1591                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1592                         != vect_used_in_outer
1593                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1594                            != vect_used_in_outer_by_reduction)
1595                     return false;
1596                 }
1597
1598               continue;
1599             }
1600
1601           gcc_assert (stmt_info);
1602
1603           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1604                || STMT_VINFO_LIVE_P (stmt_info))
1605               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1606             {
1607               /* A scalar-dependence cycle that we don't support.  */
1608               if (dump_enabled_p ())
1609                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1610                                  "not vectorized: scalar dependence cycle.\n");
1611               return false;
1612             }
1613
1614           if (STMT_VINFO_RELEVANT_P (stmt_info))
1615             {
1616               need_to_vectorize = true;
1617               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1618                   && ! PURE_SLP_STMT (stmt_info))
1619                 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1620               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1621                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1622                        && ! PURE_SLP_STMT (stmt_info))
1623                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1624                                              &cost_vec);
1625             }
1626
1627           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1628           if (ok
1629               && STMT_VINFO_LIVE_P (stmt_info)
1630               && !PURE_SLP_STMT (stmt_info))
1631             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1632                                               &cost_vec);
1633
1634           if (!ok)
1635             {
1636               if (dump_enabled_p ())
1637                 {
1638                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639                                    "not vectorized: relevant phi not "
1640                                    "supported: ");
1641                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1642                 }
1643               return false;
1644             }
1645         }
1646
1647       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1648            gsi_next (&si))
1649         {
1650           gimple *stmt = gsi_stmt (si);
1651           if (!gimple_clobber_p (stmt)
1652               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1653                                      &cost_vec))
1654             return false;
1655         }
1656     } /* bbs */
1657
1658   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1659   cost_vec.release ();
1660
1661   /* All operations in the loop are either irrelevant (deal with loop
1662      control, or dead), or only used outside the loop and can be moved
1663      out of the loop (e.g. invariants, inductions).  The loop can be
1664      optimized away by scalar optimizations.  We're better off not
1665      touching this loop.  */
1666   if (!need_to_vectorize)
1667     {
1668       if (dump_enabled_p ())
1669         dump_printf_loc (MSG_NOTE, vect_location,
1670                          "All the computation can be taken out of the loop.\n");
1671       if (dump_enabled_p ())
1672         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1673                          "not vectorized: redundant loop. no profit to "
1674                          "vectorize.\n");
1675       return false;
1676     }
1677
1678   return true;
1679 }
1680
1681 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1682    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1683    definitely no, or -1 if it's worth retrying.  */
1684
1685 static int
1686 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1687 {
1688   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1689   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1690
1691   /* Only fully-masked loops can have iteration counts less than the
1692      vectorization factor.  */
1693   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1694     {
1695       HOST_WIDE_INT max_niter;
1696
1697       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1698         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1699       else
1700         max_niter = max_stmt_executions_int (loop);
1701
1702       if (max_niter != -1
1703           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1704         {
1705           if (dump_enabled_p ())
1706             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707                              "not vectorized: iteration count smaller than "
1708                              "vectorization factor.\n");
1709           return 0;
1710         }
1711     }
1712
1713   int min_profitable_iters, min_profitable_estimate;
1714   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1715                                       &min_profitable_estimate);
1716
1717   if (min_profitable_iters < 0)
1718     {
1719       if (dump_enabled_p ())
1720         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1721                          "not vectorized: vectorization not profitable.\n");
1722       if (dump_enabled_p ())
1723         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724                          "not vectorized: vector version will never be "
1725                          "profitable.\n");
1726       return -1;
1727     }
1728
1729   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1730                                * assumed_vf);
1731
1732   /* Use the cost model only if it is more conservative than user specified
1733      threshold.  */
1734   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1735                                     min_profitable_iters);
1736
1737   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1738
1739   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1740       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1741     {
1742       if (dump_enabled_p ())
1743         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1744                          "not vectorized: vectorization not profitable.\n");
1745       if (dump_enabled_p ())
1746         dump_printf_loc (MSG_NOTE, vect_location,
1747                          "not vectorized: iteration count smaller than user "
1748                          "specified loop bound parameter or minimum profitable "
1749                          "iterations (whichever is more conservative).\n");
1750       return 0;
1751     }
1752
1753   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1754   if (estimated_niter == -1)
1755     estimated_niter = likely_max_stmt_executions_int (loop);
1756   if (estimated_niter != -1
1757       && ((unsigned HOST_WIDE_INT) estimated_niter
1758           < MAX (th, (unsigned) min_profitable_estimate)))
1759     {
1760       if (dump_enabled_p ())
1761         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1762                          "not vectorized: estimated iteration count too "
1763                          "small.\n");
1764       if (dump_enabled_p ())
1765         dump_printf_loc (MSG_NOTE, vect_location,
1766                          "not vectorized: estimated iteration count smaller "
1767                          "than specified loop bound parameter or minimum "
1768                          "profitable iterations (whichever is more "
1769                          "conservative).\n");
1770       return -1;
1771     }
1772
1773   return 1;
1774 }
1775
1776
1777 /* Function vect_analyze_loop_2.
1778
1779    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1780    for it.  The different analyses will record information in the
1781    loop_vec_info struct.  */
1782 static bool
1783 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1784 {
1785   bool ok;
1786   int res;
1787   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1788   poly_uint64 min_vf = 2;
1789   unsigned int n_stmts = 0;
1790
1791   /* The first group of checks is independent of the vector size.  */
1792   fatal = true;
1793
1794   /* Find all data references in the loop (which correspond to vdefs/vuses)
1795      and analyze their evolution in the loop.  */
1796
1797   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1798
1799   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1800   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1801     {
1802       if (dump_enabled_p ())
1803         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1804                          "not vectorized: loop nest containing two "
1805                          "or more consecutive inner loops cannot be "
1806                          "vectorized\n");
1807       return false;
1808     }
1809
1810   for (unsigned i = 0; i < loop->num_nodes; i++)
1811     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1812          !gsi_end_p (gsi); gsi_next (&gsi))
1813       {
1814         gimple *stmt = gsi_stmt (gsi);
1815         if (is_gimple_debug (stmt))
1816           continue;
1817         ++n_stmts;
1818         if (!find_data_references_in_stmt (loop, stmt,
1819                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1820           {
1821             if (is_gimple_call (stmt) && loop->safelen)
1822               {
1823                 tree fndecl = gimple_call_fndecl (stmt), op;
1824                 if (fndecl != NULL_TREE)
1825                   {
1826                     cgraph_node *node = cgraph_node::get (fndecl);
1827                     if (node != NULL && node->simd_clones != NULL)
1828                       {
1829                         unsigned int j, n = gimple_call_num_args (stmt);
1830                         for (j = 0; j < n; j++)
1831                           {
1832                             op = gimple_call_arg (stmt, j);
1833                             if (DECL_P (op)
1834                                 || (REFERENCE_CLASS_P (op)
1835                                     && get_base_address (op)))
1836                               break;
1837                           }
1838                         op = gimple_call_lhs (stmt);
1839                         /* Ignore #pragma omp declare simd functions
1840                            if they don't have data references in the
1841                            call stmt itself.  */
1842                         if (j == n
1843                             && !(op
1844                                  && (DECL_P (op)
1845                                      || (REFERENCE_CLASS_P (op)
1846                                          && get_base_address (op)))))
1847                           continue;
1848                       }
1849                   }
1850               }
1851             if (dump_enabled_p ())
1852               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1853                                "not vectorized: loop contains function "
1854                                "calls or data references that cannot "
1855                                "be analyzed\n");
1856             return false;
1857           }
1858       }
1859
1860   /* Analyze the data references and also adjust the minimal
1861      vectorization factor according to the loads and stores.  */
1862
1863   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1864   if (!ok)
1865     {
1866       if (dump_enabled_p ())
1867         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868                          "bad data references.\n");
1869       return false;
1870     }
1871
1872   /* Classify all cross-iteration scalar data-flow cycles.
1873      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1874   vect_analyze_scalar_cycles (loop_vinfo);
1875
1876   vect_pattern_recog (loop_vinfo);
1877
1878   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1879
1880   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1881      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1882
1883   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1884   if (!ok)
1885     {
1886       if (dump_enabled_p ())
1887         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888                          "bad data access.\n");
1889       return false;
1890     }
1891
1892   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1893
1894   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1895   if (!ok)
1896     {
1897       if (dump_enabled_p ())
1898         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899                          "unexpected pattern.\n");
1900       return false;
1901     }
1902
1903   /* While the rest of the analysis below depends on it in some way.  */
1904   fatal = false;
1905
1906   /* Analyze data dependences between the data-refs in the loop
1907      and adjust the maximum vectorization factor according to
1908      the dependences.
1909      FORNOW: fail at the first data dependence that we encounter.  */
1910
1911   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1912   if (!ok
1913       || (max_vf != MAX_VECTORIZATION_FACTOR
1914           && maybe_lt (max_vf, min_vf)))
1915     {
1916       if (dump_enabled_p ())
1917             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1918                              "bad data dependence.\n");
1919       return false;
1920     }
1921   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1922
1923   ok = vect_determine_vectorization_factor (loop_vinfo);
1924   if (!ok)
1925     {
1926       if (dump_enabled_p ())
1927         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1928                          "can't determine vectorization factor.\n");
1929       return false;
1930     }
1931   if (max_vf != MAX_VECTORIZATION_FACTOR
1932       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1933     {
1934       if (dump_enabled_p ())
1935         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1936                          "bad data dependence.\n");
1937       return false;
1938     }
1939
1940   /* Compute the scalar iteration cost.  */
1941   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1942
1943   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1944   unsigned th;
1945
1946   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1947   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1948   if (!ok)
1949     return false;
1950
1951   /* If there are any SLP instances mark them as pure_slp.  */
1952   bool slp = vect_make_slp_decision (loop_vinfo);
1953   if (slp)
1954     {
1955       /* Find stmts that need to be both vectorized and SLPed.  */
1956       vect_detect_hybrid_slp (loop_vinfo);
1957
1958       /* Update the vectorization factor based on the SLP decision.  */
1959       vect_update_vf_for_slp (loop_vinfo);
1960     }
1961
1962   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1963
1964   /* We don't expect to have to roll back to anything other than an empty
1965      set of rgroups.  */
1966   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1967
1968   /* This is the point where we can re-start analysis with SLP forced off.  */
1969 start_over:
1970
1971   /* Now the vectorization factor is final.  */
1972   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1973   gcc_assert (known_ne (vectorization_factor, 0U));
1974
1975   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1976     {
1977       dump_printf_loc (MSG_NOTE, vect_location,
1978                        "vectorization_factor = ");
1979       dump_dec (MSG_NOTE, vectorization_factor);
1980       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1981                    LOOP_VINFO_INT_NITERS (loop_vinfo));
1982     }
1983
1984   HOST_WIDE_INT max_niter
1985     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1986
1987   /* Analyze the alignment of the data-refs in the loop.
1988      Fail if a data reference is found that cannot be vectorized.  */
1989
1990   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1991   if (!ok)
1992     {
1993       if (dump_enabled_p ())
1994         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1995                          "bad data alignment.\n");
1996       return false;
1997     }
1998
1999   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2000      It is important to call pruning after vect_analyze_data_ref_accesses,
2001      since we use grouping information gathered by interleaving analysis.  */
2002   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2003   if (!ok)
2004     return false;
2005
2006   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2007      vectorization.  */
2008   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2009     {
2010     /* This pass will decide on using loop versioning and/or loop peeling in
2011        order to enhance the alignment of data references in the loop.  */
2012     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2013     if (!ok)
2014       {
2015         if (dump_enabled_p ())
2016           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2017                            "bad data alignment.\n");
2018         return false;
2019       }
2020     }
2021
2022   if (slp)
2023     {
2024       /* Analyze operations in the SLP instances.  Note this may
2025          remove unsupported SLP instances which makes the above
2026          SLP kind detection invalid.  */
2027       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2028       vect_slp_analyze_operations (loop_vinfo);
2029       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2030         goto again;
2031     }
2032
2033   /* Scan all the remaining operations in the loop that are not subject
2034      to SLP and make sure they are vectorizable.  */
2035   ok = vect_analyze_loop_operations (loop_vinfo);
2036   if (!ok)
2037     {
2038       if (dump_enabled_p ())
2039         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2040                          "bad operation or unsupported loop bound.\n");
2041       return false;
2042     }
2043
2044   /* Decide whether to use a fully-masked loop for this vectorization
2045      factor.  */
2046   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2047     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2048        && vect_verify_full_masking (loop_vinfo));
2049   if (dump_enabled_p ())
2050     {
2051       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2052         dump_printf_loc (MSG_NOTE, vect_location,
2053                          "using a fully-masked loop.\n");
2054       else
2055         dump_printf_loc (MSG_NOTE, vect_location,
2056                          "not using a fully-masked loop.\n");
2057     }
2058
2059   /* If epilog loop is required because of data accesses with gaps,
2060      one additional iteration needs to be peeled.  Check if there is
2061      enough iterations for vectorization.  */
2062   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2063       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2064       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2065     {
2066       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2067       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2068
2069       if (known_lt (wi::to_widest (scalar_niters), vf))
2070         {
2071           if (dump_enabled_p ())
2072             dump_printf_loc (MSG_NOTE, vect_location,
2073                              "loop has no enough iterations to support"
2074                              " peeling for gaps.\n");
2075           return false;
2076         }
2077     }
2078
2079   /* Check the costings of the loop make vectorizing worthwhile.  */
2080   res = vect_analyze_loop_costing (loop_vinfo);
2081   if (res < 0)
2082     goto again;
2083   if (!res)
2084     {
2085       if (dump_enabled_p ())
2086         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2087                          "Loop costings not worthwhile.\n");
2088       return false;
2089     }
2090
2091   /* Decide whether we need to create an epilogue loop to handle
2092      remaining scalar iterations.  */
2093   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2094
2095   unsigned HOST_WIDE_INT const_vf;
2096   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2097     /* The main loop handles all iterations.  */
2098     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2099   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2100            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2101     {
2102       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2103                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2104                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2105         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2106     }
2107   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2108            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2109            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2110                 < (unsigned) exact_log2 (const_vf))
2111                /* In case of versioning, check if the maximum number of
2112                   iterations is greater than th.  If they are identical,
2113                   the epilogue is unnecessary.  */
2114                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2115                    || ((unsigned HOST_WIDE_INT) max_niter
2116                        > (th / const_vf) * const_vf))))
2117     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2118
2119   /* If an epilogue loop is required make sure we can create one.  */
2120   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2121       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2122     {
2123       if (dump_enabled_p ())
2124         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2125       if (!vect_can_advance_ivs_p (loop_vinfo)
2126           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2127                                            single_exit (LOOP_VINFO_LOOP
2128                                                          (loop_vinfo))))
2129         {
2130           if (dump_enabled_p ())
2131             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2132                              "not vectorized: can't create required "
2133                              "epilog loop\n");
2134           goto again;
2135         }
2136     }
2137
2138   /* During peeling, we need to check if number of loop iterations is
2139      enough for both peeled prolog loop and vector loop.  This check
2140      can be merged along with threshold check of loop versioning, so
2141      increase threshold for this case if necessary.  */
2142   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2143     {
2144       poly_uint64 niters_th = 0;
2145
2146       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2147         {
2148           /* Niters for peeled prolog loop.  */
2149           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2150             {
2151               struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2152               tree vectype
2153                 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2154               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2155             }
2156           else
2157             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2158         }
2159
2160       /* Niters for at least one iteration of vectorized loop.  */
2161       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2162         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2163       /* One additional iteration because of peeling for gap.  */
2164       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2165         niters_th += 1;
2166       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2167     }
2168
2169   gcc_assert (known_eq (vectorization_factor,
2170                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2171
2172   /* Ok to vectorize!  */
2173   return true;
2174
2175 again:
2176   /* Try again with SLP forced off but if we didn't do any SLP there is
2177      no point in re-trying.  */
2178   if (!slp)
2179     return false;
2180
2181   /* If there are reduction chains re-trying will fail anyway.  */
2182   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2183     return false;
2184
2185   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2186      via interleaving or lane instructions.  */
2187   slp_instance instance;
2188   slp_tree node;
2189   unsigned i, j;
2190   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2191     {
2192       stmt_vec_info vinfo;
2193       vinfo = vinfo_for_stmt
2194           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2195       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2196         continue;
2197       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2198       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2199       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2200       if (! vect_store_lanes_supported (vectype, size, false)
2201          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2202          && ! vect_grouped_store_supported (vectype, size))
2203        return false;
2204       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2205         {
2206           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2207           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2208           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2209           size = STMT_VINFO_GROUP_SIZE (vinfo);
2210           vectype = STMT_VINFO_VECTYPE (vinfo);
2211           if (! vect_load_lanes_supported (vectype, size, false)
2212               && ! vect_grouped_load_supported (vectype, single_element_p,
2213                                                 size))
2214             return false;
2215         }
2216     }
2217
2218   if (dump_enabled_p ())
2219     dump_printf_loc (MSG_NOTE, vect_location,
2220                      "re-trying with SLP disabled\n");
2221
2222   /* Roll back state appropriately.  No SLP this time.  */
2223   slp = false;
2224   /* Restore vectorization factor as it were without SLP.  */
2225   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2226   /* Free the SLP instances.  */
2227   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2228     vect_free_slp_instance (instance);
2229   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2230   /* Reset SLP type to loop_vect on all stmts.  */
2231   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2232     {
2233       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2234       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2235            !gsi_end_p (si); gsi_next (&si))
2236         {
2237           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2238           STMT_SLP_TYPE (stmt_info) = loop_vect;
2239         }
2240       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2241            !gsi_end_p (si); gsi_next (&si))
2242         {
2243           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2244           STMT_SLP_TYPE (stmt_info) = loop_vect;
2245           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2246             {
2247               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2248               STMT_SLP_TYPE (stmt_info) = loop_vect;
2249               for (gimple_stmt_iterator pi
2250                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2251                    !gsi_end_p (pi); gsi_next (&pi))
2252                 {
2253                   gimple *pstmt = gsi_stmt (pi);
2254                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2255                 }
2256             }
2257         }
2258     }
2259   /* Free optimized alias test DDRS.  */
2260   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2261   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2262   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2263   /* Reset target cost data.  */
2264   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2265   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2266     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2267   /* Reset accumulated rgroup information.  */
2268   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2269   /* Reset assorted flags.  */
2270   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2271   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2272   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2273   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2274   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2275
2276   goto start_over;
2277 }
2278
2279 /* Function vect_analyze_loop.
2280
2281    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2282    for it.  The different analyses will record information in the
2283    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2284    be vectorized.  */
2285 loop_vec_info
2286 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2287 {
2288   loop_vec_info loop_vinfo;
2289   auto_vector_sizes vector_sizes;
2290
2291   /* Autodetect first vector size we try.  */
2292   current_vector_size = 0;
2293   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2294   unsigned int next_size = 0;
2295
2296   if (dump_enabled_p ())
2297     dump_printf_loc (MSG_NOTE, vect_location,
2298                      "===== analyze_loop_nest =====\n");
2299
2300   if (loop_outer (loop)
2301       && loop_vec_info_for_loop (loop_outer (loop))
2302       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2303     {
2304       if (dump_enabled_p ())
2305         dump_printf_loc (MSG_NOTE, vect_location,
2306                          "outer-loop already vectorized.\n");
2307       return NULL;
2308     }
2309
2310   poly_uint64 autodetected_vector_size = 0;
2311   while (1)
2312     {
2313       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2314       loop_vinfo = vect_analyze_loop_form (loop);
2315       if (!loop_vinfo)
2316         {
2317           if (dump_enabled_p ())
2318             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2319                              "bad loop form.\n");
2320           return NULL;
2321         }
2322
2323       bool fatal = false;
2324
2325       if (orig_loop_vinfo)
2326         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2327
2328       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2329         {
2330           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2331
2332           return loop_vinfo;
2333         }
2334
2335       delete loop_vinfo;
2336
2337       if (next_size == 0)
2338         autodetected_vector_size = current_vector_size;
2339
2340       if (next_size < vector_sizes.length ()
2341           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2342         next_size += 1;
2343
2344       if (fatal
2345           || next_size == vector_sizes.length ()
2346           || known_eq (current_vector_size, 0U))
2347         return NULL;
2348
2349       /* Try the next biggest vector size.  */
2350       current_vector_size = vector_sizes[next_size++];
2351       if (dump_enabled_p ())
2352         {
2353           dump_printf_loc (MSG_NOTE, vect_location,
2354                            "***** Re-trying analysis with "
2355                            "vector size ");
2356           dump_dec (MSG_NOTE, current_vector_size);
2357           dump_printf (MSG_NOTE, "\n");
2358         }
2359     }
2360 }
2361
2362 /* Return true if there is an in-order reduction function for CODE, storing
2363    it in *REDUC_FN if so.  */
2364
2365 static bool
2366 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2367 {
2368   switch (code)
2369     {
2370     case PLUS_EXPR:
2371       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2372       return true;
2373
2374     default:
2375       return false;
2376     }
2377 }
2378
2379 /* Function reduction_fn_for_scalar_code
2380
2381    Input:
2382    CODE - tree_code of a reduction operations.
2383
2384    Output:
2385    REDUC_FN - the corresponding internal function to be used to reduce the
2386       vector of partial results into a single scalar result, or IFN_LAST
2387       if the operation is a supported reduction operation, but does not have
2388       such an internal function.
2389
2390    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2391
2392 static bool
2393 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2394 {
2395   switch (code)
2396     {
2397       case MAX_EXPR:
2398         *reduc_fn = IFN_REDUC_MAX;
2399         return true;
2400
2401       case MIN_EXPR:
2402         *reduc_fn = IFN_REDUC_MIN;
2403         return true;
2404
2405       case PLUS_EXPR:
2406         *reduc_fn = IFN_REDUC_PLUS;
2407         return true;
2408
2409       case BIT_AND_EXPR:
2410         *reduc_fn = IFN_REDUC_AND;
2411         return true;
2412
2413       case BIT_IOR_EXPR:
2414         *reduc_fn = IFN_REDUC_IOR;
2415         return true;
2416
2417       case BIT_XOR_EXPR:
2418         *reduc_fn = IFN_REDUC_XOR;
2419         return true;
2420
2421       case MULT_EXPR:
2422       case MINUS_EXPR:
2423         *reduc_fn = IFN_LAST;
2424         return true;
2425
2426       default:
2427        return false;
2428     }
2429 }
2430
2431 /* If there is a neutral value X such that SLP reduction NODE would not
2432    be affected by the introduction of additional X elements, return that X,
2433    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2434    is true if the SLP statements perform a single reduction, false if each
2435    statement performs an independent reduction.  */
2436
2437 static tree
2438 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2439                               bool reduc_chain)
2440 {
2441   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2442   gimple *stmt = stmts[0];
2443   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2444   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2445   tree scalar_type = TREE_TYPE (vector_type);
2446   struct loop *loop = gimple_bb (stmt)->loop_father;
2447   gcc_assert (loop);
2448
2449   switch (code)
2450     {
2451     case WIDEN_SUM_EXPR:
2452     case DOT_PROD_EXPR:
2453     case SAD_EXPR:
2454     case PLUS_EXPR:
2455     case MINUS_EXPR:
2456     case BIT_IOR_EXPR:
2457     case BIT_XOR_EXPR:
2458       return build_zero_cst (scalar_type);
2459
2460     case MULT_EXPR:
2461       return build_one_cst (scalar_type);
2462
2463     case BIT_AND_EXPR:
2464       return build_all_ones_cst (scalar_type);
2465
2466     case MAX_EXPR:
2467     case MIN_EXPR:
2468       /* For MIN/MAX the initial values are neutral.  A reduction chain
2469          has only a single initial value, so that value is neutral for
2470          all statements.  */
2471       if (reduc_chain)
2472         return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2473       return NULL_TREE;
2474
2475     default:
2476       return NULL_TREE;
2477     }
2478 }
2479
2480 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2481    STMT is printed with a message MSG. */
2482
2483 static void
2484 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2485 {
2486   dump_printf_loc (msg_type, vect_location, "%s", msg);
2487   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2488 }
2489
2490
2491 /* Detect SLP reduction of the form:
2492
2493    #a1 = phi <a5, a0>
2494    a2 = operation (a1)
2495    a3 = operation (a2)
2496    a4 = operation (a3)
2497    a5 = operation (a4)
2498
2499    #a = phi <a5>
2500
2501    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2502    FIRST_STMT is the first reduction stmt in the chain
2503    (a2 = operation (a1)).
2504
2505    Return TRUE if a reduction chain was detected.  */
2506
2507 static bool
2508 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2509                        gimple *first_stmt)
2510 {
2511   struct loop *loop = (gimple_bb (phi))->loop_father;
2512   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2513   enum tree_code code;
2514   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2515   stmt_vec_info use_stmt_info, current_stmt_info;
2516   tree lhs;
2517   imm_use_iterator imm_iter;
2518   use_operand_p use_p;
2519   int nloop_uses, size = 0, n_out_of_loop_uses;
2520   bool found = false;
2521
2522   if (loop != vect_loop)
2523     return false;
2524
2525   lhs = PHI_RESULT (phi);
2526   code = gimple_assign_rhs_code (first_stmt);
2527   while (1)
2528     {
2529       nloop_uses = 0;
2530       n_out_of_loop_uses = 0;
2531       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2532         {
2533           gimple *use_stmt = USE_STMT (use_p);
2534           if (is_gimple_debug (use_stmt))
2535             continue;
2536
2537           /* Check if we got back to the reduction phi.  */
2538           if (use_stmt == phi)
2539             {
2540               loop_use_stmt = use_stmt;
2541               found = true;
2542               break;
2543             }
2544
2545           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2546             {
2547               loop_use_stmt = use_stmt;
2548               nloop_uses++;
2549             }
2550            else
2551              n_out_of_loop_uses++;
2552
2553            /* There are can be either a single use in the loop or two uses in
2554               phi nodes.  */
2555            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2556              return false;
2557         }
2558
2559       if (found)
2560         break;
2561
2562       /* We reached a statement with no loop uses.  */
2563       if (nloop_uses == 0)
2564         return false;
2565
2566       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2567       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2568         return false;
2569
2570       if (!is_gimple_assign (loop_use_stmt)
2571           || code != gimple_assign_rhs_code (loop_use_stmt)
2572           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2573         return false;
2574
2575       /* Insert USE_STMT into reduction chain.  */
2576       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2577       if (current_stmt)
2578         {
2579           current_stmt_info = vinfo_for_stmt (current_stmt);
2580           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2581           GROUP_FIRST_ELEMENT (use_stmt_info)
2582             = GROUP_FIRST_ELEMENT (current_stmt_info);
2583         }
2584       else
2585         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2586
2587       lhs = gimple_assign_lhs (loop_use_stmt);
2588       current_stmt = loop_use_stmt;
2589       size++;
2590    }
2591
2592   if (!found || loop_use_stmt != phi || size < 2)
2593     return false;
2594
2595   /* Swap the operands, if needed, to make the reduction operand be the second
2596      operand.  */
2597   lhs = PHI_RESULT (phi);
2598   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2599   while (next_stmt)
2600     {
2601       if (gimple_assign_rhs2 (next_stmt) == lhs)
2602         {
2603           tree op = gimple_assign_rhs1 (next_stmt);
2604           gimple *def_stmt = NULL;
2605
2606           if (TREE_CODE (op) == SSA_NAME)
2607             def_stmt = SSA_NAME_DEF_STMT (op);
2608
2609           /* Check that the other def is either defined in the loop
2610              ("vect_internal_def"), or it's an induction (defined by a
2611              loop-header phi-node).  */
2612           if (def_stmt
2613               && gimple_bb (def_stmt)
2614               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2615               && (is_gimple_assign (def_stmt)
2616                   || is_gimple_call (def_stmt)
2617                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2618                            == vect_induction_def
2619                   || (gimple_code (def_stmt) == GIMPLE_PHI
2620                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2621                                   == vect_internal_def
2622                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2623             {
2624               lhs = gimple_assign_lhs (next_stmt);
2625               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2626               continue;
2627             }
2628
2629           return false;
2630         }
2631       else
2632         {
2633           tree op = gimple_assign_rhs2 (next_stmt);
2634           gimple *def_stmt = NULL;
2635
2636           if (TREE_CODE (op) == SSA_NAME)
2637             def_stmt = SSA_NAME_DEF_STMT (op);
2638
2639           /* Check that the other def is either defined in the loop
2640             ("vect_internal_def"), or it's an induction (defined by a
2641             loop-header phi-node).  */
2642           if (def_stmt
2643               && gimple_bb (def_stmt)
2644               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2645               && (is_gimple_assign (def_stmt)
2646                   || is_gimple_call (def_stmt)
2647                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2648                               == vect_induction_def
2649                   || (gimple_code (def_stmt) == GIMPLE_PHI
2650                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2651                                   == vect_internal_def
2652                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2653             {
2654               if (dump_enabled_p ())
2655                 {
2656                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2657                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2658                 }
2659
2660               swap_ssa_operands (next_stmt,
2661                                  gimple_assign_rhs1_ptr (next_stmt),
2662                                  gimple_assign_rhs2_ptr (next_stmt));
2663               update_stmt (next_stmt);
2664
2665               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2666                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2667             }
2668           else
2669             return false;
2670         }
2671
2672       lhs = gimple_assign_lhs (next_stmt);
2673       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2674     }
2675
2676   /* Save the chain for further analysis in SLP detection.  */
2677   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2678   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2679   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2680
2681   return true;
2682 }
2683
2684 /* Return true if we need an in-order reduction for operation CODE
2685    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2686    overflow must wrap.  */
2687
2688 static bool
2689 needs_fold_left_reduction_p (tree type, tree_code code,
2690                              bool need_wrapping_integral_overflow)
2691 {
2692   /* CHECKME: check for !flag_finite_math_only too?  */
2693   if (SCALAR_FLOAT_TYPE_P (type))
2694     switch (code)
2695       {
2696       case MIN_EXPR:
2697       case MAX_EXPR:
2698         return false;
2699
2700       default:
2701         return !flag_associative_math;
2702       }
2703
2704   if (INTEGRAL_TYPE_P (type))
2705     {
2706       if (!operation_no_trapping_overflow (type, code))
2707         return true;
2708       if (need_wrapping_integral_overflow
2709           && !TYPE_OVERFLOW_WRAPS (type)
2710           && operation_can_overflow (code))
2711         return true;
2712       return false;
2713     }
2714
2715   if (SAT_FIXED_POINT_TYPE_P (type))
2716     return true;
2717
2718   return false;
2719 }
2720
2721 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2722    reduction operation CODE has a handled computation expression.  */
2723
2724 bool
2725 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2726                       enum tree_code code)
2727 {
2728   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2729   auto_bitmap visited;
2730   tree lookfor = PHI_RESULT (phi);
2731   ssa_op_iter curri;
2732   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2733   while (USE_FROM_PTR (curr) != loop_arg)
2734     curr = op_iter_next_use (&curri);
2735   curri.i = curri.numops;
2736   do
2737     {
2738       path.safe_push (std::make_pair (curri, curr));
2739       tree use = USE_FROM_PTR (curr);
2740       if (use == lookfor)
2741         break;
2742       gimple *def = SSA_NAME_DEF_STMT (use);
2743       if (gimple_nop_p (def)
2744           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2745         {
2746 pop:
2747           do
2748             {
2749               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2750               curri = x.first;
2751               curr = x.second;
2752               do
2753                 curr = op_iter_next_use (&curri);
2754               /* Skip already visited or non-SSA operands (from iterating
2755                  over PHI args).  */
2756               while (curr != NULL_USE_OPERAND_P
2757                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2758                          || ! bitmap_set_bit (visited,
2759                                               SSA_NAME_VERSION
2760                                                 (USE_FROM_PTR (curr)))));
2761             }
2762           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2763           if (curr == NULL_USE_OPERAND_P)
2764             break;
2765         }
2766       else
2767         {
2768           if (gimple_code (def) == GIMPLE_PHI)
2769             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2770           else
2771             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2772           while (curr != NULL_USE_OPERAND_P
2773                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2774                      || ! bitmap_set_bit (visited,
2775                                           SSA_NAME_VERSION
2776                                             (USE_FROM_PTR (curr)))))
2777             curr = op_iter_next_use (&curri);
2778           if (curr == NULL_USE_OPERAND_P)
2779             goto pop;
2780         }
2781     }
2782   while (1);
2783   if (dump_file && (dump_flags & TDF_DETAILS))
2784     {
2785       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2786       unsigned i;
2787       std::pair<ssa_op_iter, use_operand_p> *x;
2788       FOR_EACH_VEC_ELT (path, i, x)
2789         {
2790           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2791           dump_printf (MSG_NOTE, " ");
2792         }
2793       dump_printf (MSG_NOTE, "\n");
2794     }
2795
2796   /* Check whether the reduction path detected is valid.  */
2797   bool fail = path.length () == 0;
2798   bool neg = false;
2799   for (unsigned i = 1; i < path.length (); ++i)
2800     {
2801       gimple *use_stmt = USE_STMT (path[i].second);
2802       tree op = USE_FROM_PTR (path[i].second);
2803       if (! has_single_use (op)
2804           || ! is_gimple_assign (use_stmt))
2805         {
2806           fail = true;
2807           break;
2808         }
2809       if (gimple_assign_rhs_code (use_stmt) != code)
2810         {
2811           if (code == PLUS_EXPR
2812               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2813             {
2814               /* Track whether we negate the reduction value each iteration.  */
2815               if (gimple_assign_rhs2 (use_stmt) == op)
2816                 neg = ! neg;
2817             }
2818           else
2819             {
2820               fail = true;
2821               break;
2822             }
2823         }
2824     }
2825   return ! fail && ! neg;
2826 }
2827
2828
2829 /* Function vect_is_simple_reduction
2830
2831    (1) Detect a cross-iteration def-use cycle that represents a simple
2832    reduction computation.  We look for the following pattern:
2833
2834    loop_header:
2835      a1 = phi < a0, a2 >
2836      a3 = ...
2837      a2 = operation (a3, a1)
2838
2839    or
2840
2841    a3 = ...
2842    loop_header:
2843      a1 = phi < a0, a2 >
2844      a2 = operation (a3, a1)
2845
2846    such that:
2847    1. operation is commutative and associative and it is safe to
2848       change the order of the computation
2849    2. no uses for a2 in the loop (a2 is used out of the loop)
2850    3. no uses of a1 in the loop besides the reduction operation
2851    4. no uses of a1 outside the loop.
2852
2853    Conditions 1,4 are tested here.
2854    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2855
2856    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2857    nested cycles.
2858
2859    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2860    reductions:
2861
2862      a1 = phi < a0, a2 >
2863      inner loop (def of a3)
2864      a2 = phi < a3 >
2865
2866    (4) Detect condition expressions, ie:
2867      for (int i = 0; i < N; i++)
2868        if (a[i] < val)
2869         ret_val = a[i];
2870
2871 */
2872
2873 static gimple *
2874 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2875                           bool *double_reduc,
2876                           bool need_wrapping_integral_overflow,
2877                           enum vect_reduction_type *v_reduc_type)
2878 {
2879   struct loop *loop = (gimple_bb (phi))->loop_father;
2880   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2881   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2882   enum tree_code orig_code, code;
2883   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2884   tree type;
2885   int nloop_uses;
2886   tree name;
2887   imm_use_iterator imm_iter;
2888   use_operand_p use_p;
2889   bool phi_def;
2890
2891   *double_reduc = false;
2892   *v_reduc_type = TREE_CODE_REDUCTION;
2893
2894   tree phi_name = PHI_RESULT (phi);
2895   /* ???  If there are no uses of the PHI result the inner loop reduction
2896      won't be detected as possibly double-reduction by vectorizable_reduction
2897      because that tries to walk the PHI arg from the preheader edge which
2898      can be constant.  See PR60382.  */
2899   if (has_zero_uses (phi_name))
2900     return NULL;
2901   nloop_uses = 0;
2902   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2903     {
2904       gimple *use_stmt = USE_STMT (use_p);
2905       if (is_gimple_debug (use_stmt))
2906         continue;
2907
2908       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2909         {
2910           if (dump_enabled_p ())
2911             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2912                              "intermediate value used outside loop.\n");
2913
2914           return NULL;
2915         }
2916
2917       nloop_uses++;
2918       if (nloop_uses > 1)
2919         {
2920           if (dump_enabled_p ())
2921             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2922                              "reduction value used in loop.\n");
2923           return NULL;
2924         }
2925
2926       phi_use_stmt = use_stmt;
2927     }
2928
2929   edge latch_e = loop_latch_edge (loop);
2930   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2931   if (TREE_CODE (loop_arg) != SSA_NAME)
2932     {
2933       if (dump_enabled_p ())
2934         {
2935           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2936                            "reduction: not ssa_name: ");
2937           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2938           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2939         }
2940       return NULL;
2941     }
2942
2943   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2944   if (is_gimple_assign (def_stmt))
2945     {
2946       name = gimple_assign_lhs (def_stmt);
2947       phi_def = false;
2948     }
2949   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2950     {
2951       name = PHI_RESULT (def_stmt);
2952       phi_def = true;
2953     }
2954   else
2955     {
2956       if (dump_enabled_p ())
2957         {
2958           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2959                            "reduction: unhandled reduction operation: ");
2960           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2961         }
2962       return NULL;
2963     }
2964
2965   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2966     return NULL;
2967
2968   nloop_uses = 0;
2969   auto_vec<gphi *, 3> lcphis;
2970   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2971     {
2972       gimple *use_stmt = USE_STMT (use_p);
2973       if (is_gimple_debug (use_stmt))
2974         continue;
2975       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2976         nloop_uses++;
2977       else
2978         /* We can have more than one loop-closed PHI.  */
2979         lcphis.safe_push (as_a <gphi *> (use_stmt));
2980       if (nloop_uses > 1)
2981         {
2982           if (dump_enabled_p ())
2983             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2984                              "reduction used in loop.\n");
2985           return NULL;
2986         }
2987     }
2988
2989   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2990      defined in the inner loop.  */
2991   if (phi_def)
2992     {
2993       op1 = PHI_ARG_DEF (def_stmt, 0);
2994
2995       if (gimple_phi_num_args (def_stmt) != 1
2996           || TREE_CODE (op1) != SSA_NAME)
2997         {
2998           if (dump_enabled_p ())
2999             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3000                              "unsupported phi node definition.\n");
3001
3002           return NULL;
3003         }
3004
3005       def1 = SSA_NAME_DEF_STMT (op1);
3006       if (gimple_bb (def1)
3007           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3008           && loop->inner
3009           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3010           && is_gimple_assign (def1)
3011           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3012         {
3013           if (dump_enabled_p ())
3014             report_vect_op (MSG_NOTE, def_stmt,
3015                             "detected double reduction: ");
3016
3017           *double_reduc = true;
3018           return def_stmt;
3019         }
3020
3021       return NULL;
3022     }
3023
3024   /* If we are vectorizing an inner reduction we are executing that
3025      in the original order only in case we are not dealing with a
3026      double reduction.  */
3027   bool check_reduction = true;
3028   if (flow_loop_nested_p (vect_loop, loop))
3029     {
3030       gphi *lcphi;
3031       unsigned i;
3032       check_reduction = false;
3033       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3034         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3035           {
3036             gimple *use_stmt = USE_STMT (use_p);
3037             if (is_gimple_debug (use_stmt))
3038               continue;
3039             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3040               check_reduction = true;
3041           }
3042     }
3043
3044   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3045   code = orig_code = gimple_assign_rhs_code (def_stmt);
3046
3047   /* We can handle "res -= x[i]", which is non-associative by
3048      simply rewriting this into "res += -x[i]".  Avoid changing
3049      gimple instruction for the first simple tests and only do this
3050      if we're allowed to change code at all.  */
3051   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3052     code = PLUS_EXPR;
3053
3054   if (code == COND_EXPR)
3055     {
3056       if (! nested_in_vect_loop)
3057         *v_reduc_type = COND_REDUCTION;
3058
3059       op3 = gimple_assign_rhs1 (def_stmt);
3060       if (COMPARISON_CLASS_P (op3))
3061         {
3062           op4 = TREE_OPERAND (op3, 1);
3063           op3 = TREE_OPERAND (op3, 0);
3064         }
3065       if (op3 == phi_name || op4 == phi_name)
3066         {
3067           if (dump_enabled_p ())
3068             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3069                             "reduction: condition depends on previous"
3070                             " iteration: ");
3071           return NULL;
3072         }
3073
3074       op1 = gimple_assign_rhs2 (def_stmt);
3075       op2 = gimple_assign_rhs3 (def_stmt);
3076     }
3077   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3078     {
3079       if (dump_enabled_p ())
3080         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3081                         "reduction: not commutative/associative: ");
3082       return NULL;
3083     }
3084   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3085     {
3086       op1 = gimple_assign_rhs1 (def_stmt);
3087       op2 = gimple_assign_rhs2 (def_stmt);
3088     }
3089   else
3090     {
3091       if (dump_enabled_p ())
3092         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3093                         "reduction: not handled operation: ");
3094       return NULL;
3095     }
3096
3097   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3098     {
3099       if (dump_enabled_p ())
3100         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3101                         "reduction: both uses not ssa_names: ");
3102
3103       return NULL;
3104     }
3105
3106   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3107   if ((TREE_CODE (op1) == SSA_NAME
3108        && !types_compatible_p (type,TREE_TYPE (op1)))
3109       || (TREE_CODE (op2) == SSA_NAME
3110           && !types_compatible_p (type, TREE_TYPE (op2)))
3111       || (op3 && TREE_CODE (op3) == SSA_NAME
3112           && !types_compatible_p (type, TREE_TYPE (op3)))
3113       || (op4 && TREE_CODE (op4) == SSA_NAME
3114           && !types_compatible_p (type, TREE_TYPE (op4))))
3115     {
3116       if (dump_enabled_p ())
3117         {
3118           dump_printf_loc (MSG_NOTE, vect_location,
3119                            "reduction: multiple types: operation type: ");
3120           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3121           dump_printf (MSG_NOTE, ", operands types: ");
3122           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3123                              TREE_TYPE (op1));
3124           dump_printf (MSG_NOTE, ",");
3125           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3126                              TREE_TYPE (op2));
3127           if (op3)
3128             {
3129               dump_printf (MSG_NOTE, ",");
3130               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3131                                  TREE_TYPE (op3));
3132             }
3133
3134           if (op4)
3135             {
3136               dump_printf (MSG_NOTE, ",");
3137               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3138                                  TREE_TYPE (op4));
3139             }
3140           dump_printf (MSG_NOTE, "\n");
3141         }
3142
3143       return NULL;
3144     }
3145
3146   /* Check whether it's ok to change the order of the computation.
3147      Generally, when vectorizing a reduction we change the order of the
3148      computation.  This may change the behavior of the program in some
3149      cases, so we need to check that this is ok.  One exception is when
3150      vectorizing an outer-loop: the inner-loop is executed sequentially,
3151      and therefore vectorizing reductions in the inner-loop during
3152      outer-loop vectorization is safe.  */
3153   if (check_reduction
3154       && *v_reduc_type == TREE_CODE_REDUCTION
3155       && needs_fold_left_reduction_p (type, code,
3156                                       need_wrapping_integral_overflow))
3157     *v_reduc_type = FOLD_LEFT_REDUCTION;
3158
3159   /* Reduction is safe. We're dealing with one of the following:
3160      1) integer arithmetic and no trapv
3161      2) floating point arithmetic, and special flags permit this optimization
3162      3) nested cycle (i.e., outer loop vectorization).  */
3163   if (TREE_CODE (op1) == SSA_NAME)
3164     def1 = SSA_NAME_DEF_STMT (op1);
3165
3166   if (TREE_CODE (op2) == SSA_NAME)
3167     def2 = SSA_NAME_DEF_STMT (op2);
3168
3169   if (code != COND_EXPR
3170       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3171     {
3172       if (dump_enabled_p ())
3173         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3174       return NULL;
3175     }
3176
3177   /* Check that one def is the reduction def, defined by PHI,
3178      the other def is either defined in the loop ("vect_internal_def"),
3179      or it's an induction (defined by a loop-header phi-node).  */
3180
3181   if (def2 && def2 == phi
3182       && (code == COND_EXPR
3183           || !def1 || gimple_nop_p (def1)
3184           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3185           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3186               && (is_gimple_assign (def1)
3187                   || is_gimple_call (def1)
3188                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3189                       == vect_induction_def
3190                   || (gimple_code (def1) == GIMPLE_PHI
3191                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3192                           == vect_internal_def
3193                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3194     {
3195       if (dump_enabled_p ())
3196         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3197       return def_stmt;
3198     }
3199
3200   if (def1 && def1 == phi
3201       && (code == COND_EXPR
3202           || !def2 || gimple_nop_p (def2)
3203           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3204           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3205               && (is_gimple_assign (def2)
3206                   || is_gimple_call (def2)
3207                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3208                        == vect_induction_def
3209                   || (gimple_code (def2) == GIMPLE_PHI
3210                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3211                            == vect_internal_def
3212                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3213     {
3214       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3215         {
3216           /* Check if we can swap operands (just for simplicity - so that
3217              the rest of the code can assume that the reduction variable
3218              is always the last (second) argument).  */
3219           if (code == COND_EXPR)
3220             {
3221               /* Swap cond_expr by inverting the condition.  */
3222               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3223               enum tree_code invert_code = ERROR_MARK;
3224               enum tree_code cond_code = TREE_CODE (cond_expr);
3225
3226               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3227                 {
3228                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3229                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3230                 }
3231               if (invert_code != ERROR_MARK)
3232                 {
3233                   TREE_SET_CODE (cond_expr, invert_code);
3234                   swap_ssa_operands (def_stmt,
3235                                      gimple_assign_rhs2_ptr (def_stmt),
3236                                      gimple_assign_rhs3_ptr (def_stmt));
3237                 }
3238               else
3239                 {
3240                   if (dump_enabled_p ())
3241                     report_vect_op (MSG_NOTE, def_stmt,
3242                                     "detected reduction: cannot swap operands "
3243                                     "for cond_expr");
3244                   return NULL;
3245                 }
3246             }
3247           else
3248             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3249                                gimple_assign_rhs2_ptr (def_stmt));
3250
3251           if (dump_enabled_p ())
3252             report_vect_op (MSG_NOTE, def_stmt,
3253                             "detected reduction: need to swap operands: ");
3254
3255           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3256             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3257         }
3258       else
3259         {
3260           if (dump_enabled_p ())
3261             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3262         }
3263
3264       return def_stmt;
3265     }
3266
3267   /* Try to find SLP reduction chain.  */
3268   if (! nested_in_vect_loop
3269       && code != COND_EXPR
3270       && orig_code != MINUS_EXPR
3271       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3272     {
3273       if (dump_enabled_p ())
3274         report_vect_op (MSG_NOTE, def_stmt,
3275                         "reduction: detected reduction chain: ");
3276
3277       return def_stmt;
3278     }
3279
3280   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3281   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3282   while (first)
3283     {
3284       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3285       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3286       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3287       first = next;
3288     }
3289
3290   /* Look for the expression computing loop_arg from loop PHI result.  */
3291   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3292                             code))
3293     return def_stmt;
3294
3295   if (dump_enabled_p ())
3296     {
3297       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3298                       "reduction: unknown pattern: ");
3299     }
3300
3301   return NULL;
3302 }
3303
3304 /* Wrapper around vect_is_simple_reduction, which will modify code
3305    in-place if it enables detection of more reductions.  Arguments
3306    as there.  */
3307
3308 gimple *
3309 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3310                              bool *double_reduc,
3311                              bool need_wrapping_integral_overflow)
3312 {
3313   enum vect_reduction_type v_reduc_type;
3314   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3315                                           need_wrapping_integral_overflow,
3316                                           &v_reduc_type);
3317   if (def)
3318     {
3319       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3320       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3321       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3322       reduc_def_info = vinfo_for_stmt (def);
3323       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3324       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3325     }
3326   return def;
3327 }
3328
3329 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3330 int
3331 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3332                              int *peel_iters_epilogue,
3333                              stmt_vector_for_cost *scalar_cost_vec,
3334                              stmt_vector_for_cost *prologue_cost_vec,
3335                              stmt_vector_for_cost *epilogue_cost_vec)
3336 {
3337   int retval = 0;
3338   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3339
3340   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3341     {
3342       *peel_iters_epilogue = assumed_vf / 2;
3343       if (dump_enabled_p ())
3344         dump_printf_loc (MSG_NOTE, vect_location,
3345                          "cost model: epilogue peel iters set to vf/2 "
3346                          "because loop iterations are unknown .\n");
3347
3348       /* If peeled iterations are known but number of scalar loop
3349          iterations are unknown, count a taken branch per peeled loop.  */
3350       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3351                                  NULL, 0, vect_prologue);
3352       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3353                                  NULL, 0, vect_epilogue);
3354     }
3355   else
3356     {
3357       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3358       peel_iters_prologue = niters < peel_iters_prologue ?
3359                             niters : peel_iters_prologue;
3360       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3361       /* If we need to peel for gaps, but no peeling is required, we have to
3362          peel VF iterations.  */
3363       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3364         *peel_iters_epilogue = assumed_vf;
3365     }
3366
3367   stmt_info_for_cost *si;
3368   int j;
3369   if (peel_iters_prologue)
3370     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3371         {
3372           stmt_vec_info stmt_info
3373             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3374           retval += record_stmt_cost (prologue_cost_vec,
3375                                       si->count * peel_iters_prologue,
3376                                       si->kind, stmt_info, si->misalign,
3377                                       vect_prologue);
3378         }
3379   if (*peel_iters_epilogue)
3380     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3381         {
3382           stmt_vec_info stmt_info
3383             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3384           retval += record_stmt_cost (epilogue_cost_vec,
3385                                       si->count * *peel_iters_epilogue,
3386                                       si->kind, stmt_info, si->misalign,
3387                                       vect_epilogue);
3388         }
3389
3390   return retval;
3391 }
3392
3393 /* Function vect_estimate_min_profitable_iters
3394
3395    Return the number of iterations required for the vector version of the
3396    loop to be profitable relative to the cost of the scalar version of the
3397    loop.
3398
3399    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3400    of iterations for vectorization.  -1 value means loop vectorization
3401    is not profitable.  This returned value may be used for dynamic
3402    profitability check.
3403
3404    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3405    for static check against estimated number of iterations.  */
3406
3407 static void
3408 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3409                                     int *ret_min_profitable_niters,
3410                                     int *ret_min_profitable_estimate)
3411 {
3412   int min_profitable_iters;
3413   int min_profitable_estimate;
3414   int peel_iters_prologue;
3415   int peel_iters_epilogue;
3416   unsigned vec_inside_cost = 0;
3417   int vec_outside_cost = 0;
3418   unsigned vec_prologue_cost = 0;
3419   unsigned vec_epilogue_cost = 0;
3420   int scalar_single_iter_cost = 0;
3421   int scalar_outside_cost = 0;
3422   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3423   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3424   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3425
3426   /* Cost model disabled.  */
3427   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3428     {
3429       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3430       *ret_min_profitable_niters = 0;
3431       *ret_min_profitable_estimate = 0;
3432       return;
3433     }
3434
3435   /* Requires loop versioning tests to handle misalignment.  */
3436   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3437     {
3438       /*  FIXME: Make cost depend on complexity of individual check.  */
3439       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3440       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3441                             vect_prologue);
3442       dump_printf (MSG_NOTE,
3443                    "cost model: Adding cost of checks for loop "
3444                    "versioning to treat misalignment.\n");
3445     }
3446
3447   /* Requires loop versioning with alias checks.  */
3448   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3449     {
3450       /*  FIXME: Make cost depend on complexity of individual check.  */
3451       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3452       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3453                             vect_prologue);
3454       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3455       if (len)
3456         /* Count LEN - 1 ANDs and LEN comparisons.  */
3457         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3458                               NULL, 0, vect_prologue);
3459       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3460       if (len)
3461         {
3462           /* Count LEN - 1 ANDs and LEN comparisons.  */
3463           unsigned int nstmts = len * 2 - 1;
3464           /* +1 for each bias that needs adding.  */
3465           for (unsigned int i = 0; i < len; ++i)
3466             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3467               nstmts += 1;
3468           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3469                                 NULL, 0, vect_prologue);
3470         }
3471       dump_printf (MSG_NOTE,
3472                    "cost model: Adding cost of checks for loop "
3473                    "versioning aliasing.\n");
3474     }
3475
3476   /* Requires loop versioning with niter checks.  */
3477   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3478     {
3479       /*  FIXME: Make cost depend on complexity of individual check.  */
3480       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3481                             vect_prologue);
3482       dump_printf (MSG_NOTE,
3483                    "cost model: Adding cost of checks for loop "
3484                    "versioning niters.\n");
3485     }
3486
3487   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3488     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3489                           vect_prologue);
3490
3491   /* Count statements in scalar loop.  Using this as scalar cost for a single
3492      iteration for now.
3493
3494      TODO: Add outer loop support.
3495
3496      TODO: Consider assigning different costs to different scalar
3497      statements.  */
3498
3499   scalar_single_iter_cost
3500     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3501
3502   /* Add additional cost for the peeled instructions in prologue and epilogue
3503      loop.  (For fully-masked loops there will be no peeling.)
3504
3505      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3506      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3507
3508      TODO: Build an expression that represents peel_iters for prologue and
3509      epilogue to be used in a run-time test.  */
3510
3511   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3512     {
3513       peel_iters_prologue = 0;
3514       peel_iters_epilogue = 0;
3515
3516       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3517         {
3518           /* We need to peel exactly one iteration.  */
3519           peel_iters_epilogue += 1;
3520           stmt_info_for_cost *si;
3521           int j;
3522           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3523                             j, si)
3524             {
3525               struct _stmt_vec_info *stmt_info
3526                 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3527               (void) add_stmt_cost (target_cost_data, si->count,
3528                                     si->kind, stmt_info, si->misalign,
3529                                     vect_epilogue);
3530             }
3531         }
3532     }
3533   else if (npeel < 0)
3534     {
3535       peel_iters_prologue = assumed_vf / 2;
3536       dump_printf (MSG_NOTE, "cost model: "
3537                    "prologue peel iters set to vf/2.\n");
3538
3539       /* If peeling for alignment is unknown, loop bound of main loop becomes
3540          unknown.  */
3541       peel_iters_epilogue = assumed_vf / 2;
3542       dump_printf (MSG_NOTE, "cost model: "
3543                    "epilogue peel iters set to vf/2 because "
3544                    "peeling for alignment is unknown.\n");
3545
3546       /* If peeled iterations are unknown, count a taken branch and a not taken
3547          branch per peeled loop. Even if scalar loop iterations are known,
3548          vector iterations are not known since peeled prologue iterations are
3549          not known. Hence guards remain the same.  */
3550       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3551                             NULL, 0, vect_prologue);
3552       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3553                             NULL, 0, vect_prologue);
3554       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3555                             NULL, 0, vect_epilogue);
3556       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3557                             NULL, 0, vect_epilogue);
3558       stmt_info_for_cost *si;
3559       int j;
3560       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3561         {
3562           struct _stmt_vec_info *stmt_info
3563             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3564           (void) add_stmt_cost (target_cost_data,
3565                                 si->count * peel_iters_prologue,
3566                                 si->kind, stmt_info, si->misalign,
3567                                 vect_prologue);
3568           (void) add_stmt_cost (target_cost_data,
3569                                 si->count * peel_iters_epilogue,
3570                                 si->kind, stmt_info, si->misalign,
3571                                 vect_epilogue);
3572         }
3573     }
3574   else
3575     {
3576       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3577       stmt_info_for_cost *si;
3578       int j;
3579       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3580
3581       prologue_cost_vec.create (2);
3582       epilogue_cost_vec.create (2);
3583       peel_iters_prologue = npeel;
3584
3585       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3586                                           &peel_iters_epilogue,
3587                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3588                                             (loop_vinfo),
3589                                           &prologue_cost_vec,
3590                                           &epilogue_cost_vec);
3591
3592       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3593         {
3594           struct _stmt_vec_info *stmt_info
3595             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3596           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3597                                 si->misalign, vect_prologue);
3598         }
3599
3600       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3601         {
3602           struct _stmt_vec_info *stmt_info
3603             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3604           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3605                                 si->misalign, vect_epilogue);
3606         }
3607
3608       prologue_cost_vec.release ();
3609       epilogue_cost_vec.release ();
3610     }
3611
3612   /* FORNOW: The scalar outside cost is incremented in one of the
3613      following ways:
3614
3615      1. The vectorizer checks for alignment and aliasing and generates
3616      a condition that allows dynamic vectorization.  A cost model
3617      check is ANDED with the versioning condition.  Hence scalar code
3618      path now has the added cost of the versioning check.
3619
3620        if (cost > th & versioning_check)
3621          jmp to vector code
3622
3623      Hence run-time scalar is incremented by not-taken branch cost.
3624
3625      2. The vectorizer then checks if a prologue is required.  If the
3626      cost model check was not done before during versioning, it has to
3627      be done before the prologue check.
3628
3629        if (cost <= th)
3630          prologue = scalar_iters
3631        if (prologue == 0)
3632          jmp to vector code
3633        else
3634          execute prologue
3635        if (prologue == num_iters)
3636          go to exit
3637
3638      Hence the run-time scalar cost is incremented by a taken branch,
3639      plus a not-taken branch, plus a taken branch cost.
3640
3641      3. The vectorizer then checks if an epilogue is required.  If the
3642      cost model check was not done before during prologue check, it
3643      has to be done with the epilogue check.
3644
3645        if (prologue == 0)
3646          jmp to vector code
3647        else
3648          execute prologue
3649        if (prologue == num_iters)
3650          go to exit
3651        vector code:
3652          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3653            jmp to epilogue
3654
3655      Hence the run-time scalar cost should be incremented by 2 taken
3656      branches.
3657
3658      TODO: The back end may reorder the BBS's differently and reverse
3659      conditions/branch directions.  Change the estimates below to
3660      something more reasonable.  */
3661
3662   /* If the number of iterations is known and we do not do versioning, we can
3663      decide whether to vectorize at compile time.  Hence the scalar version
3664      do not carry cost model guard costs.  */
3665   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3666       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3667     {
3668       /* Cost model check occurs at versioning.  */
3669       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3670         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3671       else
3672         {
3673           /* Cost model check occurs at prologue generation.  */
3674           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3675             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3676               + vect_get_stmt_cost (cond_branch_not_taken);
3677           /* Cost model check occurs at epilogue generation.  */
3678           else
3679             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3680         }
3681     }
3682
3683   /* Complete the target-specific cost calculations.  */
3684   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3685                &vec_inside_cost, &vec_epilogue_cost);
3686
3687   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3688
3689   if (dump_enabled_p ())
3690     {
3691       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3692       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3693                    vec_inside_cost);
3694       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3695                    vec_prologue_cost);
3696       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3697                    vec_epilogue_cost);
3698       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3699                    scalar_single_iter_cost);
3700       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3701                    scalar_outside_cost);
3702       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3703                    vec_outside_cost);
3704       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3705                    peel_iters_prologue);
3706       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3707                    peel_iters_epilogue);
3708     }
3709
3710   /* Calculate number of iterations required to make the vector version
3711      profitable, relative to the loop bodies only.  The following condition
3712      must hold true:
3713      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3714      where
3715      SIC = scalar iteration cost, VIC = vector iteration cost,
3716      VOC = vector outside cost, VF = vectorization factor,
3717      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3718      SOC = scalar outside cost for run time cost model check.  */
3719
3720   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3721     {
3722       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3723                               * assumed_vf
3724                               - vec_inside_cost * peel_iters_prologue
3725                               - vec_inside_cost * peel_iters_epilogue);
3726       if (min_profitable_iters <= 0)
3727         min_profitable_iters = 0;
3728       else
3729         {
3730           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3731                                    - vec_inside_cost);
3732
3733           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3734               <= (((int) vec_inside_cost * min_profitable_iters)
3735                   + (((int) vec_outside_cost - scalar_outside_cost)
3736                      * assumed_vf)))
3737             min_profitable_iters++;
3738         }
3739     }
3740   /* vector version will never be profitable.  */
3741   else
3742     {
3743       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3744         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3745                     "did not happen for a simd loop");
3746
3747       if (dump_enabled_p ())
3748         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3749                          "cost model: the vector iteration cost = %d "
3750                          "divided by the scalar iteration cost = %d "
3751                          "is greater or equal to the vectorization factor = %d"
3752                          ".\n",
3753                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3754       *ret_min_profitable_niters = -1;
3755       *ret_min_profitable_estimate = -1;
3756       return;
3757     }
3758
3759   dump_printf (MSG_NOTE,
3760                "  Calculated minimum iters for profitability: %d\n",
3761                min_profitable_iters);
3762
3763   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3764       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3765     /* We want the vectorized loop to execute at least once.  */
3766     min_profitable_iters = assumed_vf + peel_iters_prologue;
3767
3768   if (dump_enabled_p ())
3769     dump_printf_loc (MSG_NOTE, vect_location,
3770                      "  Runtime profitability threshold = %d\n",
3771                      min_profitable_iters);
3772
3773   *ret_min_profitable_niters = min_profitable_iters;
3774
3775   /* Calculate number of iterations required to make the vector version
3776      profitable, relative to the loop bodies only.
3777
3778      Non-vectorized variant is SIC * niters and it must win over vector
3779      variant on the expected loop trip count.  The following condition must hold true:
3780      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3781
3782   if (vec_outside_cost <= 0)
3783     min_profitable_estimate = 0;
3784   else
3785     {
3786       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3787                                  * assumed_vf
3788                                  - vec_inside_cost * peel_iters_prologue
3789                                  - vec_inside_cost * peel_iters_epilogue)
3790                                  / ((scalar_single_iter_cost * assumed_vf)
3791                                    - vec_inside_cost);
3792     }
3793   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3794   if (dump_enabled_p ())
3795     dump_printf_loc (MSG_NOTE, vect_location,
3796                      "  Static estimate profitability threshold = %d\n",
3797                      min_profitable_estimate);
3798
3799   *ret_min_profitable_estimate = min_profitable_estimate;
3800 }
3801
3802 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3803    vector elements (not bits) for a vector with NELT elements.  */
3804 static void
3805 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3806                               vec_perm_builder *sel)
3807 {
3808   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3809      by vec_perm_indices.  */
3810   sel->new_vector (nelt, 1, 3);
3811   for (unsigned int i = 0; i < 3; i++)
3812     sel->quick_push (i + offset);
3813 }
3814
3815 /* Checks whether the target supports whole-vector shifts for vectors of mode
3816    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3817    it supports vec_perm_const with masks for all necessary shift amounts.  */
3818 static bool
3819 have_whole_vector_shift (machine_mode mode)
3820 {
3821   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3822     return true;
3823
3824   /* Variable-length vectors should be handled via the optab.  */
3825   unsigned int nelt;
3826   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3827     return false;
3828
3829   vec_perm_builder sel;
3830   vec_perm_indices indices;
3831   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3832     {
3833       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3834       indices.new_vector (sel, 2, nelt);
3835       if (!can_vec_perm_const_p (mode, indices, false))
3836         return false;
3837     }
3838   return true;
3839 }
3840
3841 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3842    functions. Design better to avoid maintenance issues.  */
3843
3844 /* Function vect_model_reduction_cost.
3845
3846    Models cost for a reduction operation, including the vector ops
3847    generated within the strip-mine loop, the initial definition before
3848    the loop, and the epilogue code that must be generated.  */
3849
3850 static void
3851 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3852                            int ncopies, stmt_vector_for_cost *cost_vec)
3853 {
3854   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3855   enum tree_code code;
3856   optab optab;
3857   tree vectype;
3858   gimple *orig_stmt;
3859   machine_mode mode;
3860   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3861   struct loop *loop = NULL;
3862
3863   if (loop_vinfo)
3864     loop = LOOP_VINFO_LOOP (loop_vinfo);
3865
3866   /* Condition reductions generate two reductions in the loop.  */
3867   vect_reduction_type reduction_type
3868     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3869   if (reduction_type == COND_REDUCTION)
3870     ncopies *= 2;
3871
3872   vectype = STMT_VINFO_VECTYPE (stmt_info);
3873   mode = TYPE_MODE (vectype);
3874   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3875
3876   if (!orig_stmt)
3877     orig_stmt = STMT_VINFO_STMT (stmt_info);
3878
3879   code = gimple_assign_rhs_code (orig_stmt);
3880
3881   if (reduction_type == EXTRACT_LAST_REDUCTION
3882       || reduction_type == FOLD_LEFT_REDUCTION)
3883     {
3884       /* No extra instructions needed in the prologue.  */
3885       prologue_cost = 0;
3886
3887       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3888         /* Count one reduction-like operation per vector.  */
3889         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3890                                         stmt_info, 0, vect_body);
3891       else
3892         {
3893           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
3894           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3895           inside_cost = record_stmt_cost (cost_vec, nelements,
3896                                           vec_to_scalar, stmt_info, 0,
3897                                           vect_body);
3898           inside_cost += record_stmt_cost (cost_vec, nelements,
3899                                            scalar_stmt, stmt_info, 0,
3900                                            vect_body);
3901         }
3902     }
3903   else
3904     {
3905       /* Add in cost for initial definition.
3906          For cond reduction we have four vectors: initial index, step,
3907          initial result of the data reduction, initial value of the index
3908          reduction.  */
3909       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3910       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3911                                          scalar_to_vec, stmt_info, 0,
3912                                          vect_prologue);
3913
3914       /* Cost of reduction op inside loop.  */
3915       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3916                                       stmt_info, 0, vect_body);
3917     }
3918
3919   /* Determine cost of epilogue code.
3920
3921      We have a reduction operator that will reduce the vector in one statement.
3922      Also requires scalar extract.  */
3923
3924   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3925     {
3926       if (reduc_fn != IFN_LAST)
3927         {
3928           if (reduction_type == COND_REDUCTION)
3929             {
3930               /* An EQ stmt and an COND_EXPR stmt.  */
3931               epilogue_cost += record_stmt_cost (cost_vec, 2,
3932                                                  vector_stmt, stmt_info, 0,
3933                                                  vect_epilogue);
3934               /* Reduction of the max index and a reduction of the found
3935                  values.  */
3936               epilogue_cost += record_stmt_cost (cost_vec, 2,
3937                                                  vec_to_scalar, stmt_info, 0,
3938                                                  vect_epilogue);
3939               /* A broadcast of the max value.  */
3940               epilogue_cost += record_stmt_cost (cost_vec, 1,
3941                                                  scalar_to_vec, stmt_info, 0,
3942                                                  vect_epilogue);
3943             }
3944           else
3945             {
3946               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3947                                                  stmt_info, 0, vect_epilogue);
3948               epilogue_cost += record_stmt_cost (cost_vec, 1,
3949                                                  vec_to_scalar, stmt_info, 0,
3950                                                  vect_epilogue);
3951             }
3952         }
3953       else if (reduction_type == COND_REDUCTION)
3954         {
3955           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3956           /* Extraction of scalar elements.  */
3957           epilogue_cost += record_stmt_cost (cost_vec,
3958                                              2 * estimated_nunits,
3959                                              vec_to_scalar, stmt_info, 0,
3960                                              vect_epilogue);
3961           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3962           epilogue_cost += record_stmt_cost (cost_vec,
3963                                              2 * estimated_nunits - 3,
3964                                              scalar_stmt, stmt_info, 0,
3965                                              vect_epilogue);
3966         }
3967       else if (reduction_type == EXTRACT_LAST_REDUCTION
3968                || reduction_type == FOLD_LEFT_REDUCTION)
3969         /* No extra instructions need in the epilogue.  */
3970         ;
3971       else
3972         {
3973           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3974           tree bitsize =
3975             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3976           int element_bitsize = tree_to_uhwi (bitsize);
3977           int nelements = vec_size_in_bits / element_bitsize;
3978
3979           if (code == COND_EXPR)
3980             code = MAX_EXPR;
3981
3982           optab = optab_for_tree_code (code, vectype, optab_default);
3983
3984           /* We have a whole vector shift available.  */
3985           if (optab != unknown_optab
3986               && VECTOR_MODE_P (mode)
3987               && optab_handler (optab, mode) != CODE_FOR_nothing
3988               && have_whole_vector_shift (mode))
3989             {
3990               /* Final reduction via vector shifts and the reduction operator.
3991                  Also requires scalar extract.  */
3992               epilogue_cost += record_stmt_cost (cost_vec,
3993                                                  exact_log2 (nelements) * 2,
3994                                                  vector_stmt, stmt_info, 0,
3995                                                  vect_epilogue);
3996               epilogue_cost += record_stmt_cost (cost_vec, 1,
3997                                                  vec_to_scalar, stmt_info, 0,
3998                                                  vect_epilogue);
3999             }
4000           else
4001             /* Use extracts and reduction op for final reduction.  For N
4002                elements, we have N extracts and N-1 reduction ops.  */
4003             epilogue_cost += record_stmt_cost (cost_vec,
4004                                                nelements + nelements - 1,
4005                                                vector_stmt, stmt_info, 0,
4006                                                vect_epilogue);
4007         }
4008     }
4009
4010   if (dump_enabled_p ())
4011     dump_printf (MSG_NOTE,
4012                  "vect_model_reduction_cost: inside_cost = %d, "
4013                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4014                  prologue_cost, epilogue_cost);
4015 }
4016
4017
4018 /* Function vect_model_induction_cost.
4019
4020    Models cost for induction operations.  */
4021
4022 static void
4023 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4024                            stmt_vector_for_cost *cost_vec)
4025 {
4026   unsigned inside_cost, prologue_cost;
4027
4028   if (PURE_SLP_STMT (stmt_info))
4029     return;
4030
4031   /* loop cost for vec_loop.  */
4032   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4033                                   stmt_info, 0, vect_body);
4034
4035   /* prologue cost for vec_init and vec_step.  */
4036   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4037                                     stmt_info, 0, vect_prologue);
4038
4039   if (dump_enabled_p ())
4040     dump_printf_loc (MSG_NOTE, vect_location,
4041                      "vect_model_induction_cost: inside_cost = %d, "
4042                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4043 }
4044
4045
4046
4047 /* Function get_initial_def_for_reduction
4048
4049    Input:
4050    STMT - a stmt that performs a reduction operation in the loop.
4051    INIT_VAL - the initial value of the reduction variable
4052
4053    Output:
4054    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4055         of the reduction (used for adjusting the epilog - see below).
4056    Return a vector variable, initialized according to the operation that STMT
4057         performs. This vector will be used as the initial value of the
4058         vector of partial results.
4059
4060    Option1 (adjust in epilog): Initialize the vector as follows:
4061      add/bit or/xor:    [0,0,...,0,0]
4062      mult/bit and:      [1,1,...,1,1]
4063      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4064    and when necessary (e.g. add/mult case) let the caller know
4065    that it needs to adjust the result by init_val.
4066
4067    Option2: Initialize the vector as follows:
4068      add/bit or/xor:    [init_val,0,0,...,0]
4069      mult/bit and:      [init_val,1,1,...,1]
4070      min/max/cond_expr: [init_val,init_val,...,init_val]
4071    and no adjustments are needed.
4072
4073    For example, for the following code:
4074
4075    s = init_val;
4076    for (i=0;i<n;i++)
4077      s = s + a[i];
4078
4079    STMT is 's = s + a[i]', and the reduction variable is 's'.
4080    For a vector of 4 units, we want to return either [0,0,0,init_val],
4081    or [0,0,0,0] and let the caller know that it needs to adjust
4082    the result at the end by 'init_val'.
4083
4084    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4085    initialization vector is simpler (same element in all entries), if
4086    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4087
4088    A cost model should help decide between these two schemes.  */
4089
4090 tree
4091 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4092                                tree *adjustment_def)
4093 {
4094   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4095   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4096   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4097   tree scalar_type = TREE_TYPE (init_val);
4098   tree vectype = get_vectype_for_scalar_type (scalar_type);
4099   enum tree_code code = gimple_assign_rhs_code (stmt);
4100   tree def_for_init;
4101   tree init_def;
4102   bool nested_in_vect_loop = false;
4103   REAL_VALUE_TYPE real_init_val = dconst0;
4104   int int_init_val = 0;
4105   gimple *def_stmt = NULL;
4106   gimple_seq stmts = NULL;
4107
4108   gcc_assert (vectype);
4109
4110   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4111               || SCALAR_FLOAT_TYPE_P (scalar_type));
4112
4113   if (nested_in_vect_loop_p (loop, stmt))
4114     nested_in_vect_loop = true;
4115   else
4116     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4117
4118   /* In case of double reduction we only create a vector variable to be put
4119      in the reduction phi node.  The actual statement creation is done in
4120      vect_create_epilog_for_reduction.  */
4121   if (adjustment_def && nested_in_vect_loop
4122       && TREE_CODE (init_val) == SSA_NAME
4123       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4124       && gimple_code (def_stmt) == GIMPLE_PHI
4125       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4126       && vinfo_for_stmt (def_stmt)
4127       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4128           == vect_double_reduction_def)
4129     {
4130       *adjustment_def = NULL;
4131       return vect_create_destination_var (init_val, vectype);
4132     }
4133
4134   vect_reduction_type reduction_type
4135     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4136
4137   /* In case of a nested reduction do not use an adjustment def as
4138      that case is not supported by the epilogue generation correctly
4139      if ncopies is not one.  */
4140   if (adjustment_def && nested_in_vect_loop)
4141     {
4142       *adjustment_def = NULL;
4143       return vect_get_vec_def_for_operand (init_val, stmt);
4144     }
4145
4146   switch (code)
4147     {
4148     case WIDEN_SUM_EXPR:
4149     case DOT_PROD_EXPR:
4150     case SAD_EXPR:
4151     case PLUS_EXPR:
4152     case MINUS_EXPR:
4153     case BIT_IOR_EXPR:
4154     case BIT_XOR_EXPR:
4155     case MULT_EXPR:
4156     case BIT_AND_EXPR:
4157       {
4158         /* ADJUSTMENT_DEF is NULL when called from
4159            vect_create_epilog_for_reduction to vectorize double reduction.  */
4160         if (adjustment_def)
4161           *adjustment_def = init_val;
4162
4163         if (code == MULT_EXPR)
4164           {
4165             real_init_val = dconst1;
4166             int_init_val = 1;
4167           }
4168
4169         if (code == BIT_AND_EXPR)
4170           int_init_val = -1;
4171
4172         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4173           def_for_init = build_real (scalar_type, real_init_val);
4174         else
4175           def_for_init = build_int_cst (scalar_type, int_init_val);
4176
4177         if (adjustment_def)
4178           /* Option1: the first element is '0' or '1' as well.  */
4179           init_def = gimple_build_vector_from_val (&stmts, vectype,
4180                                                    def_for_init);
4181         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4182           {
4183             /* Option2 (variable length): the first element is INIT_VAL.  */
4184             init_def = gimple_build_vector_from_val (&stmts, vectype,
4185                                                      def_for_init);
4186             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4187                                      vectype, init_def, init_val);
4188           }
4189         else
4190           {
4191             /* Option2: the first element is INIT_VAL.  */
4192             tree_vector_builder elts (vectype, 1, 2);
4193             elts.quick_push (init_val);
4194             elts.quick_push (def_for_init);
4195             init_def = gimple_build_vector (&stmts, &elts);
4196           }
4197       }
4198       break;
4199
4200     case MIN_EXPR:
4201     case MAX_EXPR:
4202     case COND_EXPR:
4203       {
4204         if (adjustment_def)
4205           {
4206             *adjustment_def = NULL_TREE;
4207             if (reduction_type != COND_REDUCTION
4208                 && reduction_type != EXTRACT_LAST_REDUCTION)
4209               {
4210                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4211                 break;
4212               }
4213           }
4214         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4215         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4216       }
4217       break;
4218
4219     default:
4220       gcc_unreachable ();
4221     }
4222
4223   if (stmts)
4224     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4225   return init_def;
4226 }
4227
4228 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4229    NUMBER_OF_VECTORS is the number of vector defs to create.
4230    If NEUTRAL_OP is nonnull, introducing extra elements of that
4231    value will not change the result.  */
4232
4233 static void
4234 get_initial_defs_for_reduction (slp_tree slp_node,
4235                                 vec<tree> *vec_oprnds,
4236                                 unsigned int number_of_vectors,
4237                                 bool reduc_chain, tree neutral_op)
4238 {
4239   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4240   gimple *stmt = stmts[0];
4241   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4242   unsigned HOST_WIDE_INT nunits;
4243   unsigned j, number_of_places_left_in_vector;
4244   tree vector_type;
4245   tree vop;
4246   int group_size = stmts.length ();
4247   unsigned int vec_num, i;
4248   unsigned number_of_copies = 1;
4249   vec<tree> voprnds;
4250   voprnds.create (number_of_vectors);
4251   struct loop *loop;
4252   auto_vec<tree, 16> permute_results;
4253
4254   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4255
4256   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4257
4258   loop = (gimple_bb (stmt))->loop_father;
4259   gcc_assert (loop);
4260   edge pe = loop_preheader_edge (loop);
4261
4262   gcc_assert (!reduc_chain || neutral_op);
4263
4264   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4265      created vectors. It is greater than 1 if unrolling is performed.
4266
4267      For example, we have two scalar operands, s1 and s2 (e.g., group of
4268      strided accesses of size two), while NUNITS is four (i.e., four scalars
4269      of this type can be packed in a vector).  The output vector will contain
4270      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4271      will be 2).
4272
4273      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4274      containing the operands.
4275
4276      For example, NUNITS is four as before, and the group size is 8
4277      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4278      {s5, s6, s7, s8}.  */
4279
4280   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4281     nunits = group_size;
4282
4283   number_of_copies = nunits * number_of_vectors / group_size;
4284
4285   number_of_places_left_in_vector = nunits;
4286   bool constant_p = true;
4287   tree_vector_builder elts (vector_type, nunits, 1);
4288   elts.quick_grow (nunits);
4289   for (j = 0; j < number_of_copies; j++)
4290     {
4291       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4292         {
4293           tree op;
4294           /* Get the def before the loop.  In reduction chain we have only
4295              one initial value.  */
4296           if ((j != (number_of_copies - 1)
4297                || (reduc_chain && i != 0))
4298               && neutral_op)
4299             op = neutral_op;
4300           else
4301             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4302
4303           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4304           number_of_places_left_in_vector--;
4305           elts[number_of_places_left_in_vector] = op;
4306           if (!CONSTANT_CLASS_P (op))
4307             constant_p = false;
4308
4309           if (number_of_places_left_in_vector == 0)
4310             {
4311               gimple_seq ctor_seq = NULL;
4312               tree init;
4313               if (constant_p && !neutral_op
4314                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4315                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4316                 /* Build the vector directly from ELTS.  */
4317                 init = gimple_build_vector (&ctor_seq, &elts);
4318               else if (neutral_op)
4319                 {
4320                   /* Build a vector of the neutral value and shift the
4321                      other elements into place.  */
4322                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4323                                                        neutral_op);
4324                   int k = nunits;
4325                   while (k > 0 && elts[k - 1] == neutral_op)
4326                     k -= 1;
4327                   while (k > 0)
4328                     {
4329                       k -= 1;
4330                       init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4331                                            vector_type, init, elts[k]);
4332                     }
4333                 }
4334               else
4335                 {
4336                   /* First time round, duplicate ELTS to fill the
4337                      required number of vectors, then cherry pick the
4338                      appropriate result for each iteration.  */
4339                   if (vec_oprnds->is_empty ())
4340                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4341                                               number_of_vectors,
4342                                               permute_results);
4343                   init = permute_results[number_of_vectors - j - 1];
4344                 }
4345               if (ctor_seq != NULL)
4346                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4347               voprnds.quick_push (init);
4348
4349               number_of_places_left_in_vector = nunits;
4350               elts.new_vector (vector_type, nunits, 1);
4351               elts.quick_grow (nunits);
4352               constant_p = true;
4353             }
4354         }
4355     }
4356
4357   /* Since the vectors are created in the reverse order, we should invert
4358      them.  */
4359   vec_num = voprnds.length ();
4360   for (j = vec_num; j != 0; j--)
4361     {
4362       vop = voprnds[j - 1];
4363       vec_oprnds->quick_push (vop);
4364     }
4365
4366   voprnds.release ();
4367
4368   /* In case that VF is greater than the unrolling factor needed for the SLP
4369      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4370      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4371      to replicate the vectors.  */
4372   tree neutral_vec = NULL;
4373   while (number_of_vectors > vec_oprnds->length ())
4374     {
4375       if (neutral_op)
4376         {
4377           if (!neutral_vec)
4378             {
4379               gimple_seq ctor_seq = NULL;
4380               neutral_vec = gimple_build_vector_from_val
4381                 (&ctor_seq, vector_type, neutral_op);
4382               if (ctor_seq != NULL)
4383                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4384             }
4385           vec_oprnds->quick_push (neutral_vec);
4386         }
4387       else
4388         {
4389           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4390             vec_oprnds->quick_push (vop);
4391         }
4392     }
4393 }
4394
4395
4396 /* Function vect_create_epilog_for_reduction
4397
4398    Create code at the loop-epilog to finalize the result of a reduction
4399    computation.
4400
4401    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4402      reduction statements.
4403    STMT is the scalar reduction stmt that is being vectorized.
4404    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4405      number of elements that we can fit in a vectype (nunits).  In this case
4406      we have to generate more than one vector stmt - i.e - we need to "unroll"
4407      the vector stmt by a factor VF/nunits.  For more details see documentation
4408      in vectorizable_operation.
4409    REDUC_FN is the internal function for the epilog reduction.
4410    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4411      computation.
4412    REDUC_INDEX is the index of the operand in the right hand side of the
4413      statement that is defined by REDUCTION_PHI.
4414    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4415    SLP_NODE is an SLP node containing a group of reduction statements. The
4416      first one in this group is STMT.
4417    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4418      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4419      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4420      any value of the IV in the loop.
4421    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4422    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4423      null if this is not an SLP reduction
4424
4425    This function:
4426    1. Creates the reduction def-use cycles: sets the arguments for
4427       REDUCTION_PHIS:
4428       The loop-entry argument is the vectorized initial-value of the reduction.
4429       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4430       sums.
4431    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4432       by calling the function specified by REDUC_FN if available, or by
4433       other means (whole-vector shifts or a scalar loop).
4434       The function also creates a new phi node at the loop exit to preserve
4435       loop-closed form, as illustrated below.
4436
4437      The flow at the entry to this function:
4438
4439         loop:
4440           vec_def = phi <null, null>            # REDUCTION_PHI
4441           VECT_DEF = vector_stmt                # vectorized form of STMT
4442           s_loop = scalar_stmt                  # (scalar) STMT
4443         loop_exit:
4444           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4445           use <s_out0>
4446           use <s_out0>
4447
4448      The above is transformed by this function into:
4449
4450         loop:
4451           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4452           VECT_DEF = vector_stmt                # vectorized form of STMT
4453           s_loop = scalar_stmt                  # (scalar) STMT
4454         loop_exit:
4455           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4456           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4457           v_out2 = reduce <v_out1>
4458           s_out3 = extract_field <v_out2, 0>
4459           s_out4 = adjust_result <s_out3>
4460           use <s_out4>
4461           use <s_out4>
4462 */
4463
4464 static void
4465 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4466                                   gimple *reduc_def_stmt,
4467                                   int ncopies, internal_fn reduc_fn,
4468                                   vec<gimple *> reduction_phis,
4469                                   bool double_reduc,
4470                                   slp_tree slp_node,
4471                                   slp_instance slp_node_instance,
4472                                   tree induc_val, enum tree_code induc_code,
4473                                   tree neutral_op)
4474 {
4475   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4476   stmt_vec_info prev_phi_info;
4477   tree vectype;
4478   machine_mode mode;
4479   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4480   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4481   basic_block exit_bb;
4482   tree scalar_dest;
4483   tree scalar_type;
4484   gimple *new_phi = NULL, *phi;
4485   gimple_stmt_iterator exit_gsi;
4486   tree vec_dest;
4487   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4488   gimple *epilog_stmt = NULL;
4489   enum tree_code code = gimple_assign_rhs_code (stmt);
4490   gimple *exit_phi;
4491   tree bitsize;
4492   tree adjustment_def = NULL;
4493   tree vec_initial_def = NULL;
4494   tree expr, def, initial_def = NULL;
4495   tree orig_name, scalar_result;
4496   imm_use_iterator imm_iter, phi_imm_iter;
4497   use_operand_p use_p, phi_use_p;
4498   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4499   bool nested_in_vect_loop = false;
4500   auto_vec<gimple *> new_phis;
4501   auto_vec<gimple *> inner_phis;
4502   enum vect_def_type dt = vect_unknown_def_type;
4503   int j, i;
4504   auto_vec<tree> scalar_results;
4505   unsigned int group_size = 1, k, ratio;
4506   auto_vec<tree> vec_initial_defs;
4507   auto_vec<gimple *> phis;
4508   bool slp_reduc = false;
4509   bool direct_slp_reduc;
4510   tree new_phi_result;
4511   gimple *inner_phi = NULL;
4512   tree induction_index = NULL_TREE;
4513
4514   if (slp_node)
4515     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4516
4517   if (nested_in_vect_loop_p (loop, stmt))
4518     {
4519       outer_loop = loop;
4520       loop = loop->inner;
4521       nested_in_vect_loop = true;
4522       gcc_assert (!slp_node);
4523     }
4524
4525   vectype = STMT_VINFO_VECTYPE (stmt_info);
4526   gcc_assert (vectype);
4527   mode = TYPE_MODE (vectype);
4528
4529   /* 1. Create the reduction def-use cycle:
4530      Set the arguments of REDUCTION_PHIS, i.e., transform
4531
4532         loop:
4533           vec_def = phi <null, null>            # REDUCTION_PHI
4534           VECT_DEF = vector_stmt                # vectorized form of STMT
4535           ...
4536
4537      into:
4538
4539         loop:
4540           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4541           VECT_DEF = vector_stmt                # vectorized form of STMT
4542           ...
4543
4544      (in case of SLP, do it for all the phis). */
4545
4546   /* Get the loop-entry arguments.  */
4547   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4548   if (slp_node)
4549     {
4550       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4551       vec_initial_defs.reserve (vec_num);
4552       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4553                                       &vec_initial_defs, vec_num,
4554                                       GROUP_FIRST_ELEMENT (stmt_info),
4555                                       neutral_op);
4556     }
4557   else
4558     {
4559       /* Get at the scalar def before the loop, that defines the initial value
4560          of the reduction variable.  */
4561       gimple *def_stmt;
4562       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4563                                            loop_preheader_edge (loop));
4564       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4565          and we can't use zero for induc_val, use initial_def.  Similarly
4566          for REDUC_MIN and initial_def larger than the base.  */
4567       if (TREE_CODE (initial_def) == INTEGER_CST
4568           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4569               == INTEGER_INDUC_COND_REDUCTION)
4570           && !integer_zerop (induc_val)
4571           && ((induc_code == MAX_EXPR
4572                && tree_int_cst_lt (initial_def, induc_val))
4573               || (induc_code == MIN_EXPR
4574                   && tree_int_cst_lt (induc_val, initial_def))))
4575         induc_val = initial_def;
4576       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4577       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4578                                                        &adjustment_def);
4579       vec_initial_defs.create (1);
4580       vec_initial_defs.quick_push (vec_initial_def);
4581     }
4582
4583   /* Set phi nodes arguments.  */
4584   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4585     {
4586       tree vec_init_def = vec_initial_defs[i];
4587       tree def = vect_defs[i];
4588       for (j = 0; j < ncopies; j++)
4589         {
4590           if (j != 0)
4591             {
4592               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4593               if (nested_in_vect_loop)
4594                 vec_init_def
4595                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4596                                                     vec_init_def);
4597             }
4598
4599           /* Set the loop-entry arg of the reduction-phi.  */
4600
4601           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4602               == INTEGER_INDUC_COND_REDUCTION)
4603             {
4604               /* Initialise the reduction phi to zero.  This prevents initial
4605                  values of non-zero interferring with the reduction op.  */
4606               gcc_assert (ncopies == 1);
4607               gcc_assert (i == 0);
4608
4609               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4610               tree induc_val_vec
4611                 = build_vector_from_val (vec_init_def_type, induc_val);
4612
4613               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4614                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4615             }
4616           else
4617             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4618                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4619
4620           /* Set the loop-latch arg for the reduction-phi.  */
4621           if (j > 0)
4622             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4623
4624           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4625                        UNKNOWN_LOCATION);
4626
4627           if (dump_enabled_p ())
4628             {
4629               dump_printf_loc (MSG_NOTE, vect_location,
4630                                "transform reduction: created def-use cycle: ");
4631               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4632               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4633             }
4634         }
4635     }
4636
4637   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4638      which is updated with the current index of the loop for every match of
4639      the original loop's cond_expr (VEC_STMT).  This results in a vector
4640      containing the last time the condition passed for that vector lane.
4641      The first match will be a 1 to allow 0 to be used for non-matching
4642      indexes.  If there are no matches at all then the vector will be all
4643      zeroes.  */
4644   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4645     {
4646       tree indx_before_incr, indx_after_incr;
4647       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4648
4649       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4650       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4651
4652       int scalar_precision
4653         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4654       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4655       tree cr_index_vector_type = build_vector_type
4656         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4657
4658       /* First we create a simple vector induction variable which starts
4659          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4660          vector size (STEP).  */
4661
4662       /* Create a {1,2,3,...} vector.  */
4663       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4664
4665       /* Create a vector of the step value.  */
4666       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4667       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4668
4669       /* Create an induction variable.  */
4670       gimple_stmt_iterator incr_gsi;
4671       bool insert_after;
4672       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4673       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4674                  insert_after, &indx_before_incr, &indx_after_incr);
4675
4676       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4677          filled with zeros (VEC_ZERO).  */
4678
4679       /* Create a vector of 0s.  */
4680       tree zero = build_zero_cst (cr_index_scalar_type);
4681       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4682
4683       /* Create a vector phi node.  */
4684       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4685       new_phi = create_phi_node (new_phi_tree, loop->header);
4686       set_vinfo_for_stmt (new_phi,
4687                           new_stmt_vec_info (new_phi, loop_vinfo));
4688       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4689                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4690
4691       /* Now take the condition from the loops original cond_expr
4692          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4693          every match uses values from the induction variable
4694          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4695          (NEW_PHI_TREE).
4696          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4697          the new cond_expr (INDEX_COND_EXPR).  */
4698
4699       /* Duplicate the condition from vec_stmt.  */
4700       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4701
4702       /* Create a conditional, where the condition is taken from vec_stmt
4703          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4704          else is the phi (NEW_PHI_TREE).  */
4705       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4706                                      ccompare, indx_before_incr,
4707                                      new_phi_tree);
4708       induction_index = make_ssa_name (cr_index_vector_type);
4709       gimple *index_condition = gimple_build_assign (induction_index,
4710                                                      index_cond_expr);
4711       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4712       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4713                                                         loop_vinfo);
4714       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4715       set_vinfo_for_stmt (index_condition, index_vec_info);
4716
4717       /* Update the phi with the vec cond.  */
4718       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4719                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4720     }
4721
4722   /* 2. Create epilog code.
4723         The reduction epilog code operates across the elements of the vector
4724         of partial results computed by the vectorized loop.
4725         The reduction epilog code consists of:
4726
4727         step 1: compute the scalar result in a vector (v_out2)
4728         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4729         step 3: adjust the scalar result (s_out3) if needed.
4730
4731         Step 1 can be accomplished using one the following three schemes:
4732           (scheme 1) using reduc_fn, if available.
4733           (scheme 2) using whole-vector shifts, if available.
4734           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4735                      combined.
4736
4737           The overall epilog code looks like this:
4738
4739           s_out0 = phi <s_loop>         # original EXIT_PHI
4740           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4741           v_out2 = reduce <v_out1>              # step 1
4742           s_out3 = extract_field <v_out2, 0>    # step 2
4743           s_out4 = adjust_result <s_out3>       # step 3
4744
4745           (step 3 is optional, and steps 1 and 2 may be combined).
4746           Lastly, the uses of s_out0 are replaced by s_out4.  */
4747
4748
4749   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4750          v_out1 = phi <VECT_DEF>
4751          Store them in NEW_PHIS.  */
4752
4753   exit_bb = single_exit (loop)->dest;
4754   prev_phi_info = NULL;
4755   new_phis.create (vect_defs.length ());
4756   FOR_EACH_VEC_ELT (vect_defs, i, def)
4757     {
4758       for (j = 0; j < ncopies; j++)
4759         {
4760           tree new_def = copy_ssa_name (def);
4761           phi = create_phi_node (new_def, exit_bb);
4762           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4763           if (j == 0)
4764             new_phis.quick_push (phi);
4765           else
4766             {
4767               def = vect_get_vec_def_for_stmt_copy (dt, def);
4768               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4769             }
4770
4771           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4772           prev_phi_info = vinfo_for_stmt (phi);
4773         }
4774     }
4775
4776   /* The epilogue is created for the outer-loop, i.e., for the loop being
4777      vectorized.  Create exit phis for the outer loop.  */
4778   if (double_reduc)
4779     {
4780       loop = outer_loop;
4781       exit_bb = single_exit (loop)->dest;
4782       inner_phis.create (vect_defs.length ());
4783       FOR_EACH_VEC_ELT (new_phis, i, phi)
4784         {
4785           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4786           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4787           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4788                            PHI_RESULT (phi));
4789           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4790                                                             loop_vinfo));
4791           inner_phis.quick_push (phi);
4792           new_phis[i] = outer_phi;
4793           prev_phi_info = vinfo_for_stmt (outer_phi);
4794           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4795             {
4796               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4797               new_result = copy_ssa_name (PHI_RESULT (phi));
4798               outer_phi = create_phi_node (new_result, exit_bb);
4799               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4800                                PHI_RESULT (phi));
4801               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4802                                                                 loop_vinfo));
4803               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4804               prev_phi_info = vinfo_for_stmt (outer_phi);
4805             }
4806         }
4807     }
4808
4809   exit_gsi = gsi_after_labels (exit_bb);
4810
4811   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4812          (i.e. when reduc_fn is not available) and in the final adjustment
4813          code (if needed).  Also get the original scalar reduction variable as
4814          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4815          represents a reduction pattern), the tree-code and scalar-def are
4816          taken from the original stmt that the pattern-stmt (STMT) replaces.
4817          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4818          are taken from STMT.  */
4819
4820   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4821   if (!orig_stmt)
4822     {
4823       /* Regular reduction  */
4824       orig_stmt = stmt;
4825     }
4826   else
4827     {
4828       /* Reduction pattern  */
4829       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4830       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4831       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4832     }
4833
4834   code = gimple_assign_rhs_code (orig_stmt);
4835   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4836      partial results are added and not subtracted.  */
4837   if (code == MINUS_EXPR)
4838     code = PLUS_EXPR;
4839
4840   scalar_dest = gimple_assign_lhs (orig_stmt);
4841   scalar_type = TREE_TYPE (scalar_dest);
4842   scalar_results.create (group_size);
4843   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4844   bitsize = TYPE_SIZE (scalar_type);
4845
4846   /* In case this is a reduction in an inner-loop while vectorizing an outer
4847      loop - we don't need to extract a single scalar result at the end of the
4848      inner-loop (unless it is double reduction, i.e., the use of reduction is
4849      outside the outer-loop).  The final vector of partial results will be used
4850      in the vectorized outer-loop, or reduced to a scalar result at the end of
4851      the outer-loop.  */
4852   if (nested_in_vect_loop && !double_reduc)
4853     goto vect_finalize_reduction;
4854
4855   /* SLP reduction without reduction chain, e.g.,
4856      # a1 = phi <a2, a0>
4857      # b1 = phi <b2, b0>
4858      a2 = operation (a1)
4859      b2 = operation (b1)  */
4860   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4861
4862   /* True if we should implement SLP_REDUC using native reduction operations
4863      instead of scalar operations.  */
4864   direct_slp_reduc = (reduc_fn != IFN_LAST
4865                       && slp_reduc
4866                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4867
4868   /* In case of reduction chain, e.g.,
4869      # a1 = phi <a3, a0>
4870      a2 = operation (a1)
4871      a3 = operation (a2),
4872
4873      we may end up with more than one vector result.  Here we reduce them to
4874      one vector.  */
4875   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4876     {
4877       tree first_vect = PHI_RESULT (new_phis[0]);
4878       gassign *new_vec_stmt = NULL;
4879       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4880       for (k = 1; k < new_phis.length (); k++)
4881         {
4882           gimple *next_phi = new_phis[k];
4883           tree second_vect = PHI_RESULT (next_phi);
4884           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4885           new_vec_stmt = gimple_build_assign (tem, code,
4886                                               first_vect, second_vect);
4887           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4888           first_vect = tem;
4889         }
4890
4891       new_phi_result = first_vect;
4892       if (new_vec_stmt)
4893         {
4894           new_phis.truncate (0);
4895           new_phis.safe_push (new_vec_stmt);
4896         }
4897     }
4898   /* Likewise if we couldn't use a single defuse cycle.  */
4899   else if (ncopies > 1)
4900     {
4901       gcc_assert (new_phis.length () == 1);
4902       tree first_vect = PHI_RESULT (new_phis[0]);
4903       gassign *new_vec_stmt = NULL;
4904       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4905       gimple *next_phi = new_phis[0];
4906       for (int k = 1; k < ncopies; ++k)
4907         {
4908           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4909           tree second_vect = PHI_RESULT (next_phi);
4910           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4911           new_vec_stmt = gimple_build_assign (tem, code,
4912                                               first_vect, second_vect);
4913           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4914           first_vect = tem;
4915         }
4916       new_phi_result = first_vect;
4917       new_phis.truncate (0);
4918       new_phis.safe_push (new_vec_stmt);
4919     }
4920   else
4921     new_phi_result = PHI_RESULT (new_phis[0]);
4922
4923   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4924       && reduc_fn != IFN_LAST)
4925     {
4926       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4927          various data values where the condition matched and another vector
4928          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4929          need to extract the last matching index (which will be the index with
4930          highest value) and use this to index into the data vector.
4931          For the case where there were no matches, the data vector will contain
4932          all default values and the index vector will be all zeros.  */
4933
4934       /* Get various versions of the type of the vector of indexes.  */
4935       tree index_vec_type = TREE_TYPE (induction_index);
4936       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4937       tree index_scalar_type = TREE_TYPE (index_vec_type);
4938       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4939         (index_vec_type);
4940
4941       /* Get an unsigned integer version of the type of the data vector.  */
4942       int scalar_precision
4943         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4944       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4945       tree vectype_unsigned = build_vector_type
4946         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4947
4948       /* First we need to create a vector (ZERO_VEC) of zeros and another
4949          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4950          can create using a MAX reduction and then expanding.
4951          In the case where the loop never made any matches, the max index will
4952          be zero.  */
4953
4954       /* Vector of {0, 0, 0,...}.  */
4955       tree zero_vec = make_ssa_name (vectype);
4956       tree zero_vec_rhs = build_zero_cst (vectype);
4957       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4958       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4959
4960       /* Find maximum value from the vector of found indexes.  */
4961       tree max_index = make_ssa_name (index_scalar_type);
4962       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4963                                                           1, induction_index);
4964       gimple_call_set_lhs (max_index_stmt, max_index);
4965       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4966
4967       /* Vector of {max_index, max_index, max_index,...}.  */
4968       tree max_index_vec = make_ssa_name (index_vec_type);
4969       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4970                                                       max_index);
4971       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4972                                                         max_index_vec_rhs);
4973       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4974
4975       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4976          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4977          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4978          otherwise.  Only one value should match, resulting in a vector
4979          (VEC_COND) with one data value and the rest zeros.
4980          In the case where the loop never made any matches, every index will
4981          match, resulting in a vector with all data values (which will all be
4982          the default value).  */
4983
4984       /* Compare the max index vector to the vector of found indexes to find
4985          the position of the max value.  */
4986       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4987       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4988                                                       induction_index,
4989                                                       max_index_vec);
4990       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4991
4992       /* Use the compare to choose either values from the data vector or
4993          zero.  */
4994       tree vec_cond = make_ssa_name (vectype);
4995       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4996                                                    vec_compare, new_phi_result,
4997                                                    zero_vec);
4998       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4999
5000       /* Finally we need to extract the data value from the vector (VEC_COND)
5001          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5002          reduction, but because this doesn't exist, we can use a MAX reduction
5003          instead.  The data value might be signed or a float so we need to cast
5004          it first.
5005          In the case where the loop never made any matches, the data values are
5006          all identical, and so will reduce down correctly.  */
5007
5008       /* Make the matched data values unsigned.  */
5009       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5010       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5011                                        vec_cond);
5012       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5013                                                         VIEW_CONVERT_EXPR,
5014                                                         vec_cond_cast_rhs);
5015       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5016
5017       /* Reduce down to a scalar value.  */
5018       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5019       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5020                                                            1, vec_cond_cast);
5021       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5022       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5023
5024       /* Convert the reduced value back to the result type and set as the
5025          result.  */
5026       gimple_seq stmts = NULL;
5027       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5028                                data_reduc);
5029       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5030       scalar_results.safe_push (new_temp);
5031     }
5032   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5033            && reduc_fn == IFN_LAST)
5034     {
5035       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5036          idx = 0;
5037          idx_val = induction_index[0];
5038          val = data_reduc[0];
5039          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5040            if (induction_index[i] > idx_val)
5041              val = data_reduc[i], idx_val = induction_index[i];
5042          return val;  */
5043
5044       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5045       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5046       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5047       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5048       /* Enforced by vectorizable_reduction, which ensures we have target
5049          support before allowing a conditional reduction on variable-length
5050          vectors.  */
5051       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5052       tree idx_val = NULL_TREE, val = NULL_TREE;
5053       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5054         {
5055           tree old_idx_val = idx_val;
5056           tree old_val = val;
5057           idx_val = make_ssa_name (idx_eltype);
5058           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5059                                              build3 (BIT_FIELD_REF, idx_eltype,
5060                                                      induction_index,
5061                                                      bitsize_int (el_size),
5062                                                      bitsize_int (off)));
5063           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5064           val = make_ssa_name (data_eltype);
5065           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5066                                              build3 (BIT_FIELD_REF,
5067                                                      data_eltype,
5068                                                      new_phi_result,
5069                                                      bitsize_int (el_size),
5070                                                      bitsize_int (off)));
5071           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5072           if (off != 0)
5073             {
5074               tree new_idx_val = idx_val;
5075               tree new_val = val;
5076               if (off != v_size - el_size)
5077                 {
5078                   new_idx_val = make_ssa_name (idx_eltype);
5079                   epilog_stmt = gimple_build_assign (new_idx_val,
5080                                                      MAX_EXPR, idx_val,
5081                                                      old_idx_val);
5082                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5083                 }
5084               new_val = make_ssa_name (data_eltype);
5085               epilog_stmt = gimple_build_assign (new_val,
5086                                                  COND_EXPR,
5087                                                  build2 (GT_EXPR,
5088                                                          boolean_type_node,
5089                                                          idx_val,
5090                                                          old_idx_val),
5091                                                  val, old_val);
5092               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5093               idx_val = new_idx_val;
5094               val = new_val;
5095             }
5096         }
5097       /* Convert the reduced value back to the result type and set as the
5098          result.  */
5099       gimple_seq stmts = NULL;
5100       val = gimple_convert (&stmts, scalar_type, val);
5101       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5102       scalar_results.safe_push (val);
5103     }
5104
5105   /* 2.3 Create the reduction code, using one of the three schemes described
5106          above. In SLP we simply need to extract all the elements from the
5107          vector (without reducing them), so we use scalar shifts.  */
5108   else if (reduc_fn != IFN_LAST && !slp_reduc)
5109     {
5110       tree tmp;
5111       tree vec_elem_type;
5112
5113       /* Case 1:  Create:
5114          v_out2 = reduc_expr <v_out1>  */
5115
5116       if (dump_enabled_p ())
5117         dump_printf_loc (MSG_NOTE, vect_location,
5118                          "Reduce using direct vector reduction.\n");
5119
5120       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5121       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5122         {
5123           tree tmp_dest
5124             = vect_create_destination_var (scalar_dest, vec_elem_type);
5125           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5126                                                     new_phi_result);
5127           gimple_set_lhs (epilog_stmt, tmp_dest);
5128           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5129           gimple_set_lhs (epilog_stmt, new_temp);
5130           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5131
5132           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5133                                              new_temp);
5134         }
5135       else
5136         {
5137           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5138                                                     new_phi_result);
5139           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5140         }
5141
5142       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5143       gimple_set_lhs (epilog_stmt, new_temp);
5144       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5145
5146       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5147            == INTEGER_INDUC_COND_REDUCTION)
5148           && !operand_equal_p (initial_def, induc_val, 0))
5149         {
5150           /* Earlier we set the initial value to be a vector if induc_val
5151              values.  Check the result and if it is induc_val then replace
5152              with the original initial value, unless induc_val is
5153              the same as initial_def already.  */
5154           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5155                                   induc_val);
5156
5157           tmp = make_ssa_name (new_scalar_dest);
5158           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5159                                              initial_def, new_temp);
5160           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5161           new_temp = tmp;
5162         }
5163
5164       scalar_results.safe_push (new_temp);
5165     }
5166   else if (direct_slp_reduc)
5167     {
5168       /* Here we create one vector for each of the GROUP_SIZE results,
5169          with the elements for other SLP statements replaced with the
5170          neutral value.  We can then do a normal reduction on each vector.  */
5171
5172       /* Enforced by vectorizable_reduction.  */
5173       gcc_assert (new_phis.length () == 1);
5174       gcc_assert (pow2p_hwi (group_size));
5175
5176       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5177       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5178       gimple_seq seq = NULL;
5179
5180       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5181          and the same element size as VECTYPE.  */
5182       tree index = build_index_vector (vectype, 0, 1);
5183       tree index_type = TREE_TYPE (index);
5184       tree index_elt_type = TREE_TYPE (index_type);
5185       tree mask_type = build_same_sized_truth_vector_type (index_type);
5186
5187       /* Create a vector that, for each element, identifies which of
5188          the GROUP_SIZE results should use it.  */
5189       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5190       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5191                             build_vector_from_val (index_type, index_mask));
5192
5193       /* Get a neutral vector value.  This is simply a splat of the neutral
5194          scalar value if we have one, otherwise the initial scalar value
5195          is itself a neutral value.  */
5196       tree vector_identity = NULL_TREE;
5197       if (neutral_op)
5198         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5199                                                         neutral_op);
5200       for (unsigned int i = 0; i < group_size; ++i)
5201         {
5202           /* If there's no univeral neutral value, we can use the
5203              initial scalar value from the original PHI.  This is used
5204              for MIN and MAX reduction, for example.  */
5205           if (!neutral_op)
5206             {
5207               tree scalar_value
5208                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5209                                          loop_preheader_edge (loop));
5210               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5211                                                               scalar_value);
5212             }
5213
5214           /* Calculate the equivalent of:
5215
5216              sel[j] = (index[j] == i);
5217
5218              which selects the elements of NEW_PHI_RESULT that should
5219              be included in the result.  */
5220           tree compare_val = build_int_cst (index_elt_type, i);
5221           compare_val = build_vector_from_val (index_type, compare_val);
5222           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5223                                    index, compare_val);
5224
5225           /* Calculate the equivalent of:
5226
5227              vec = seq ? new_phi_result : vector_identity;
5228
5229              VEC is now suitable for a full vector reduction.  */
5230           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5231                                    sel, new_phi_result, vector_identity);
5232
5233           /* Do the reduction and convert it to the appropriate type.  */
5234           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5235                                       TREE_TYPE (vectype), vec);
5236           scalar = gimple_convert (&seq, scalar_type, scalar);
5237           scalar_results.safe_push (scalar);
5238         }
5239       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5240     }
5241   else
5242     {
5243       bool reduce_with_shift;
5244       tree vec_temp;
5245
5246       /* COND reductions all do the final reduction with MAX_EXPR
5247          or MIN_EXPR.  */
5248       if (code == COND_EXPR)
5249         {
5250           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5251               == INTEGER_INDUC_COND_REDUCTION)
5252             code = induc_code;
5253           else
5254             code = MAX_EXPR;
5255         }
5256
5257       /* See if the target wants to do the final (shift) reduction
5258          in a vector mode of smaller size and first reduce upper/lower
5259          halves against each other.  */
5260       enum machine_mode mode1 = mode;
5261       tree vectype1 = vectype;
5262       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5263       unsigned sz1 = sz;
5264       if (!slp_reduc
5265           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5266         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5267
5268       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5269       reduce_with_shift = have_whole_vector_shift (mode1);
5270       if (!VECTOR_MODE_P (mode1))
5271         reduce_with_shift = false;
5272       else
5273         {
5274           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5275           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5276             reduce_with_shift = false;
5277         }
5278
5279       /* First reduce the vector to the desired vector size we should
5280          do shift reduction on by combining upper and lower halves.  */
5281       new_temp = new_phi_result;
5282       while (sz > sz1)
5283         {
5284           gcc_assert (!slp_reduc);
5285           sz /= 2;
5286           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5287
5288           /* The target has to make sure we support lowpart/highpart
5289              extraction, either via direct vector extract or through
5290              an integer mode punning.  */
5291           tree dst1, dst2;
5292           if (convert_optab_handler (vec_extract_optab,
5293                                      TYPE_MODE (TREE_TYPE (new_temp)),
5294                                      TYPE_MODE (vectype1))
5295               != CODE_FOR_nothing)
5296             {
5297               /* Extract sub-vectors directly once vec_extract becomes
5298                  a conversion optab.  */
5299               dst1 = make_ssa_name (vectype1);
5300               epilog_stmt
5301                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5302                                          build3 (BIT_FIELD_REF, vectype1,
5303                                                  new_temp, TYPE_SIZE (vectype1),
5304                                                  bitsize_int (0)));
5305               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5306               dst2 =  make_ssa_name (vectype1);
5307               epilog_stmt
5308                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5309                                          build3 (BIT_FIELD_REF, vectype1,
5310                                                  new_temp, TYPE_SIZE (vectype1),
5311                                                  bitsize_int (sz * BITS_PER_UNIT)));
5312               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5313             }
5314           else
5315             {
5316               /* Extract via punning to appropriately sized integer mode
5317                  vector.  */
5318               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5319                                                             1);
5320               tree etype = build_vector_type (eltype, 2);
5321               gcc_assert (convert_optab_handler (vec_extract_optab,
5322                                                  TYPE_MODE (etype),
5323                                                  TYPE_MODE (eltype))
5324                           != CODE_FOR_nothing);
5325               tree tem = make_ssa_name (etype);
5326               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5327                                                  build1 (VIEW_CONVERT_EXPR,
5328                                                          etype, new_temp));
5329               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5330               new_temp = tem;
5331               tem = make_ssa_name (eltype);
5332               epilog_stmt
5333                   = gimple_build_assign (tem, BIT_FIELD_REF,
5334                                          build3 (BIT_FIELD_REF, eltype,
5335                                                  new_temp, TYPE_SIZE (eltype),
5336                                                  bitsize_int (0)));
5337               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5338               dst1 = make_ssa_name (vectype1);
5339               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5340                                                  build1 (VIEW_CONVERT_EXPR,
5341                                                          vectype1, tem));
5342               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5343               tem = make_ssa_name (eltype);
5344               epilog_stmt
5345                   = gimple_build_assign (tem, BIT_FIELD_REF,
5346                                          build3 (BIT_FIELD_REF, eltype,
5347                                                  new_temp, TYPE_SIZE (eltype),
5348                                                  bitsize_int (sz * BITS_PER_UNIT)));
5349               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5350               dst2 =  make_ssa_name (vectype1);
5351               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5352                                                  build1 (VIEW_CONVERT_EXPR,
5353                                                          vectype1, tem));
5354               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5355             }
5356
5357           new_temp = make_ssa_name (vectype1);
5358           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5359           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5360         }
5361
5362       if (reduce_with_shift && !slp_reduc)
5363         {
5364           int element_bitsize = tree_to_uhwi (bitsize);
5365           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5366              for variable-length vectors and also requires direct target support
5367              for loop reductions.  */
5368           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5369           int nelements = vec_size_in_bits / element_bitsize;
5370           vec_perm_builder sel;
5371           vec_perm_indices indices;
5372
5373           int elt_offset;
5374
5375           tree zero_vec = build_zero_cst (vectype1);
5376           /* Case 2: Create:
5377              for (offset = nelements/2; offset >= 1; offset/=2)
5378                 {
5379                   Create:  va' = vec_shift <va, offset>
5380                   Create:  va = vop <va, va'>
5381                 }  */
5382
5383           tree rhs;
5384
5385           if (dump_enabled_p ())
5386             dump_printf_loc (MSG_NOTE, vect_location,
5387                              "Reduce using vector shifts\n");
5388
5389           mode1 = TYPE_MODE (vectype1);
5390           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5391           for (elt_offset = nelements / 2;
5392                elt_offset >= 1;
5393                elt_offset /= 2)
5394             {
5395               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5396               indices.new_vector (sel, 2, nelements);
5397               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5398               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5399                                                  new_temp, zero_vec, mask);
5400               new_name = make_ssa_name (vec_dest, epilog_stmt);
5401               gimple_assign_set_lhs (epilog_stmt, new_name);
5402               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5403
5404               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5405                                                  new_temp);
5406               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5407               gimple_assign_set_lhs (epilog_stmt, new_temp);
5408               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5409             }
5410
5411           /* 2.4  Extract the final scalar result.  Create:
5412              s_out3 = extract_field <v_out2, bitpos>  */
5413
5414           if (dump_enabled_p ())
5415             dump_printf_loc (MSG_NOTE, vect_location,
5416                              "extract scalar result\n");
5417
5418           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5419                         bitsize, bitsize_zero_node);
5420           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5421           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5422           gimple_assign_set_lhs (epilog_stmt, new_temp);
5423           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5424           scalar_results.safe_push (new_temp);
5425         }
5426       else
5427         {
5428           /* Case 3: Create:
5429              s = extract_field <v_out2, 0>
5430              for (offset = element_size;
5431                   offset < vector_size;
5432                   offset += element_size;)
5433                {
5434                  Create:  s' = extract_field <v_out2, offset>
5435                  Create:  s = op <s, s'>  // For non SLP cases
5436                }  */
5437
5438           if (dump_enabled_p ())
5439             dump_printf_loc (MSG_NOTE, vect_location,
5440                              "Reduce using scalar code.\n");
5441
5442           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5443           int element_bitsize = tree_to_uhwi (bitsize);
5444           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5445             {
5446               int bit_offset;
5447               if (gimple_code (new_phi) == GIMPLE_PHI)
5448                 vec_temp = PHI_RESULT (new_phi);
5449               else
5450                 vec_temp = gimple_assign_lhs (new_phi);
5451               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5452                                  bitsize_zero_node);
5453               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5454               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5455               gimple_assign_set_lhs (epilog_stmt, new_temp);
5456               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5457
5458               /* In SLP we don't need to apply reduction operation, so we just
5459                  collect s' values in SCALAR_RESULTS.  */
5460               if (slp_reduc)
5461                 scalar_results.safe_push (new_temp);
5462
5463               for (bit_offset = element_bitsize;
5464                    bit_offset < vec_size_in_bits;
5465                    bit_offset += element_bitsize)
5466                 {
5467                   tree bitpos = bitsize_int (bit_offset);
5468                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5469                                      bitsize, bitpos);
5470
5471                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5472                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5473                   gimple_assign_set_lhs (epilog_stmt, new_name);
5474                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5475
5476                   if (slp_reduc)
5477                     {
5478                       /* In SLP we don't need to apply reduction operation, so
5479                          we just collect s' values in SCALAR_RESULTS.  */
5480                       new_temp = new_name;
5481                       scalar_results.safe_push (new_name);
5482                     }
5483                   else
5484                     {
5485                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5486                                                          new_name, new_temp);
5487                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5488                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5489                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5490                     }
5491                 }
5492             }
5493
5494           /* The only case where we need to reduce scalar results in SLP, is
5495              unrolling.  If the size of SCALAR_RESULTS is greater than
5496              GROUP_SIZE, we reduce them combining elements modulo
5497              GROUP_SIZE.  */
5498           if (slp_reduc)
5499             {
5500               tree res, first_res, new_res;
5501               gimple *new_stmt;
5502
5503               /* Reduce multiple scalar results in case of SLP unrolling.  */
5504               for (j = group_size; scalar_results.iterate (j, &res);
5505                    j++)
5506                 {
5507                   first_res = scalar_results[j % group_size];
5508                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5509                                                   first_res, res);
5510                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5511                   gimple_assign_set_lhs (new_stmt, new_res);
5512                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5513                   scalar_results[j % group_size] = new_res;
5514                 }
5515             }
5516           else
5517             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5518             scalar_results.safe_push (new_temp);
5519         }
5520
5521       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5522            == INTEGER_INDUC_COND_REDUCTION)
5523           && !operand_equal_p (initial_def, induc_val, 0))
5524         {
5525           /* Earlier we set the initial value to be a vector if induc_val
5526              values.  Check the result and if it is induc_val then replace
5527              with the original initial value, unless induc_val is
5528              the same as initial_def already.  */
5529           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5530                                   induc_val);
5531
5532           tree tmp = make_ssa_name (new_scalar_dest);
5533           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5534                                              initial_def, new_temp);
5535           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5536           scalar_results[0] = tmp;
5537         }
5538     }
5539
5540 vect_finalize_reduction:
5541
5542   if (double_reduc)
5543     loop = loop->inner;
5544
5545   /* 2.5 Adjust the final result by the initial value of the reduction
5546          variable. (When such adjustment is not needed, then
5547          'adjustment_def' is zero).  For example, if code is PLUS we create:
5548          new_temp = loop_exit_def + adjustment_def  */
5549
5550   if (adjustment_def)
5551     {
5552       gcc_assert (!slp_reduc);
5553       if (nested_in_vect_loop)
5554         {
5555           new_phi = new_phis[0];
5556           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5557           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5558           new_dest = vect_create_destination_var (scalar_dest, vectype);
5559         }
5560       else
5561         {
5562           new_temp = scalar_results[0];
5563           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5564           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5565           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5566         }
5567
5568       epilog_stmt = gimple_build_assign (new_dest, expr);
5569       new_temp = make_ssa_name (new_dest, epilog_stmt);
5570       gimple_assign_set_lhs (epilog_stmt, new_temp);
5571       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5572       if (nested_in_vect_loop)
5573         {
5574           set_vinfo_for_stmt (epilog_stmt,
5575                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5576           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5577                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5578
5579           if (!double_reduc)
5580             scalar_results.quick_push (new_temp);
5581           else
5582             scalar_results[0] = new_temp;
5583         }
5584       else
5585         scalar_results[0] = new_temp;
5586
5587       new_phis[0] = epilog_stmt;
5588     }
5589
5590   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5591           phis with new adjusted scalar results, i.e., replace use <s_out0>
5592           with use <s_out4>.
5593
5594      Transform:
5595         loop_exit:
5596           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5597           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5598           v_out2 = reduce <v_out1>
5599           s_out3 = extract_field <v_out2, 0>
5600           s_out4 = adjust_result <s_out3>
5601           use <s_out0>
5602           use <s_out0>
5603
5604      into:
5605
5606         loop_exit:
5607           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5608           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5609           v_out2 = reduce <v_out1>
5610           s_out3 = extract_field <v_out2, 0>
5611           s_out4 = adjust_result <s_out3>
5612           use <s_out4>
5613           use <s_out4> */
5614
5615
5616   /* In SLP reduction chain we reduce vector results into one vector if
5617      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5618      the last stmt in the reduction chain, since we are looking for the loop
5619      exit phi node.  */
5620   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5621     {
5622       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5623       /* Handle reduction patterns.  */
5624       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5625         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5626
5627       scalar_dest = gimple_assign_lhs (dest_stmt);
5628       group_size = 1;
5629     }
5630
5631   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5632      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5633      need to match SCALAR_RESULTS with corresponding statements.  The first
5634      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5635      the first vector stmt, etc.
5636      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5637   if (group_size > new_phis.length ())
5638     {
5639       ratio = group_size / new_phis.length ();
5640       gcc_assert (!(group_size % new_phis.length ()));
5641     }
5642   else
5643     ratio = 1;
5644
5645   for (k = 0; k < group_size; k++)
5646     {
5647       if (k % ratio == 0)
5648         {
5649           epilog_stmt = new_phis[k / ratio];
5650           reduction_phi = reduction_phis[k / ratio];
5651           if (double_reduc)
5652             inner_phi = inner_phis[k / ratio];
5653         }
5654
5655       if (slp_reduc)
5656         {
5657           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5658
5659           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5660           /* SLP statements can't participate in patterns.  */
5661           gcc_assert (!orig_stmt);
5662           scalar_dest = gimple_assign_lhs (current_stmt);
5663         }
5664
5665       phis.create (3);
5666       /* Find the loop-closed-use at the loop exit of the original scalar
5667          result.  (The reduction result is expected to have two immediate uses -
5668          one at the latch block, and one at the loop exit).  */
5669       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5670         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5671             && !is_gimple_debug (USE_STMT (use_p)))
5672           phis.safe_push (USE_STMT (use_p));
5673
5674       /* While we expect to have found an exit_phi because of loop-closed-ssa
5675          form we can end up without one if the scalar cycle is dead.  */
5676
5677       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5678         {
5679           if (outer_loop)
5680             {
5681               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5682               gphi *vect_phi;
5683
5684               /* FORNOW. Currently not supporting the case that an inner-loop
5685                  reduction is not used in the outer-loop (but only outside the
5686                  outer-loop), unless it is double reduction.  */
5687               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5688                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5689                           || double_reduc);
5690
5691               if (double_reduc)
5692                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5693               else
5694                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5695               if (!double_reduc
5696                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5697                       != vect_double_reduction_def)
5698                 continue;
5699
5700               /* Handle double reduction:
5701
5702                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5703                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5704                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5705                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5706
5707                  At that point the regular reduction (stmt2 and stmt3) is
5708                  already vectorized, as well as the exit phi node, stmt4.
5709                  Here we vectorize the phi node of double reduction, stmt1, and
5710                  update all relevant statements.  */
5711
5712               /* Go through all the uses of s2 to find double reduction phi
5713                  node, i.e., stmt1 above.  */
5714               orig_name = PHI_RESULT (exit_phi);
5715               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5716                 {
5717                   stmt_vec_info use_stmt_vinfo;
5718                   stmt_vec_info new_phi_vinfo;
5719                   tree vect_phi_init, preheader_arg, vect_phi_res;
5720                   basic_block bb = gimple_bb (use_stmt);
5721                   gimple *use;
5722
5723                   /* Check that USE_STMT is really double reduction phi
5724                      node.  */
5725                   if (gimple_code (use_stmt) != GIMPLE_PHI
5726                       || gimple_phi_num_args (use_stmt) != 2
5727                       || bb->loop_father != outer_loop)
5728                     continue;
5729                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5730                   if (!use_stmt_vinfo
5731                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5732                           != vect_double_reduction_def)
5733                     continue;
5734
5735                   /* Create vector phi node for double reduction:
5736                      vs1 = phi <vs0, vs2>
5737                      vs1 was created previously in this function by a call to
5738                        vect_get_vec_def_for_operand and is stored in
5739                        vec_initial_def;
5740                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5741                      vs0 is created here.  */
5742
5743                   /* Create vector phi node.  */
5744                   vect_phi = create_phi_node (vec_initial_def, bb);
5745                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5746                                     loop_vec_info_for_loop (outer_loop));
5747                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5748
5749                   /* Create vs0 - initial def of the double reduction phi.  */
5750                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5751                                              loop_preheader_edge (outer_loop));
5752                   vect_phi_init = get_initial_def_for_reduction
5753                     (stmt, preheader_arg, NULL);
5754
5755                   /* Update phi node arguments with vs0 and vs2.  */
5756                   add_phi_arg (vect_phi, vect_phi_init,
5757                                loop_preheader_edge (outer_loop),
5758                                UNKNOWN_LOCATION);
5759                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5760                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5761                   if (dump_enabled_p ())
5762                     {
5763                       dump_printf_loc (MSG_NOTE, vect_location,
5764                                        "created double reduction phi node: ");
5765                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5766                     }
5767
5768                   vect_phi_res = PHI_RESULT (vect_phi);
5769
5770                   /* Replace the use, i.e., set the correct vs1 in the regular
5771                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5772                      loop is redundant.  */
5773                   use = reduction_phi;
5774                   for (j = 0; j < ncopies; j++)
5775                     {
5776                       edge pr_edge = loop_preheader_edge (loop);
5777                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5778                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5779                     }
5780                 }
5781             }
5782         }
5783
5784       phis.release ();
5785       if (nested_in_vect_loop)
5786         {
5787           if (double_reduc)
5788             loop = outer_loop;
5789           else
5790             continue;
5791         }
5792
5793       phis.create (3);
5794       /* Find the loop-closed-use at the loop exit of the original scalar
5795          result.  (The reduction result is expected to have two immediate uses,
5796          one at the latch block, and one at the loop exit).  For double
5797          reductions we are looking for exit phis of the outer loop.  */
5798       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5799         {
5800           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5801             {
5802               if (!is_gimple_debug (USE_STMT (use_p)))
5803                 phis.safe_push (USE_STMT (use_p));
5804             }
5805           else
5806             {
5807               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5808                 {
5809                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5810
5811                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5812                     {
5813                       if (!flow_bb_inside_loop_p (loop,
5814                                              gimple_bb (USE_STMT (phi_use_p)))
5815                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5816                         phis.safe_push (USE_STMT (phi_use_p));
5817                     }
5818                 }
5819             }
5820         }
5821
5822       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5823         {
5824           /* Replace the uses:  */
5825           orig_name = PHI_RESULT (exit_phi);
5826           scalar_result = scalar_results[k];
5827           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5828             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5829               SET_USE (use_p, scalar_result);
5830         }
5831
5832       phis.release ();
5833     }
5834 }
5835
5836 /* Return a vector of type VECTYPE that is equal to the vector select
5837    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5838    before GSI.  */
5839
5840 static tree
5841 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5842                      tree vec, tree identity)
5843 {
5844   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5845   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5846                                           mask, vec, identity);
5847   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5848   return cond;
5849 }
5850
5851 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5852    order, starting with LHS.  Insert the extraction statements before GSI and
5853    associate the new scalar SSA names with variable SCALAR_DEST.
5854    Return the SSA name for the result.  */
5855
5856 static tree
5857 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5858                        tree_code code, tree lhs, tree vector_rhs)
5859 {
5860   tree vectype = TREE_TYPE (vector_rhs);
5861   tree scalar_type = TREE_TYPE (vectype);
5862   tree bitsize = TYPE_SIZE (scalar_type);
5863   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5864   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5865
5866   for (unsigned HOST_WIDE_INT bit_offset = 0;
5867        bit_offset < vec_size_in_bits;
5868        bit_offset += element_bitsize)
5869     {
5870       tree bitpos = bitsize_int (bit_offset);
5871       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5872                          bitsize, bitpos);
5873
5874       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5875       rhs = make_ssa_name (scalar_dest, stmt);
5876       gimple_assign_set_lhs (stmt, rhs);
5877       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5878
5879       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5880       tree new_name = make_ssa_name (scalar_dest, stmt);
5881       gimple_assign_set_lhs (stmt, new_name);
5882       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5883       lhs = new_name;
5884     }
5885   return lhs;
5886 }
5887
5888 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
5889    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5890    statement.  CODE is the operation performed by STMT and OPS are
5891    its scalar operands.  REDUC_INDEX is the index of the operand in
5892    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5893    implements in-order reduction, or IFN_LAST if we should open-code it.
5894    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5895    that should be used to control the operation in a fully-masked loop.  */
5896
5897 static bool
5898 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5899                                gimple **vec_stmt, slp_tree slp_node,
5900                                gimple *reduc_def_stmt,
5901                                tree_code code, internal_fn reduc_fn,
5902                                tree ops[3], tree vectype_in,
5903                                int reduc_index, vec_loop_masks *masks)
5904 {
5905   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5906   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5907   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5908   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5909   gimple *new_stmt = NULL;
5910
5911   int ncopies;
5912   if (slp_node)
5913     ncopies = 1;
5914   else
5915     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5916
5917   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5918   gcc_assert (ncopies == 1);
5919   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5920   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5921   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5922               == FOLD_LEFT_REDUCTION);
5923
5924   if (slp_node)
5925     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5926                           TYPE_VECTOR_SUBPARTS (vectype_in)));
5927
5928   tree op0 = ops[1 - reduc_index];
5929
5930   int group_size = 1;
5931   gimple *scalar_dest_def;
5932   auto_vec<tree> vec_oprnds0;
5933   if (slp_node)
5934     {
5935       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5936       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5937       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5938     }
5939   else
5940     {
5941       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5942       vec_oprnds0.create (1);
5943       vec_oprnds0.quick_push (loop_vec_def0);
5944       scalar_dest_def = stmt;
5945     }
5946
5947   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5948   tree scalar_type = TREE_TYPE (scalar_dest);
5949   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5950
5951   int vec_num = vec_oprnds0.length ();
5952   gcc_assert (vec_num == 1 || slp_node);
5953   tree vec_elem_type = TREE_TYPE (vectype_out);
5954   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5955
5956   tree vector_identity = NULL_TREE;
5957   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5958     vector_identity = build_zero_cst (vectype_out);
5959
5960   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5961   int i;
5962   tree def0;
5963   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5964     {
5965       tree mask = NULL_TREE;
5966       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5967         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5968
5969       /* Handle MINUS by adding the negative.  */
5970       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5971         {
5972           tree negated = make_ssa_name (vectype_out);
5973           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5974           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5975           def0 = negated;
5976         }
5977
5978       if (mask)
5979         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5980                                     vector_identity);
5981
5982       /* On the first iteration the input is simply the scalar phi
5983          result, and for subsequent iterations it is the output of
5984          the preceding operation.  */
5985       if (reduc_fn != IFN_LAST)
5986         {
5987           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5988           /* For chained SLP reductions the output of the previous reduction
5989              operation serves as the input of the next. For the final statement
5990              the output cannot be a temporary - we reuse the original
5991              scalar destination of the last statement.  */
5992           if (i != vec_num - 1)
5993             {
5994               gimple_set_lhs (new_stmt, scalar_dest_var);
5995               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5996               gimple_set_lhs (new_stmt, reduc_var);
5997             }
5998         }
5999       else
6000         {
6001           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6002                                              reduc_var, def0);
6003           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6004           /* Remove the statement, so that we can use the same code paths
6005              as for statements that we've just created.  */
6006           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6007           gsi_remove (&tmp_gsi, false);
6008         }
6009
6010       if (i == vec_num - 1)
6011         {
6012           gimple_set_lhs (new_stmt, scalar_dest);
6013           vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6014         }
6015       else
6016         vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6017
6018       if (slp_node)
6019         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6020     }
6021
6022   if (!slp_node)
6023     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6024
6025   return true;
6026 }
6027
6028 /* Function is_nonwrapping_integer_induction.
6029
6030    Check if STMT (which is part of loop LOOP) both increments and
6031    does not cause overflow.  */
6032
6033 static bool
6034 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6035 {
6036   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6037   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6038   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6039   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6040   widest_int ni, max_loop_value, lhs_max;
6041   bool overflow = false;
6042
6043   /* Make sure the loop is integer based.  */
6044   if (TREE_CODE (base) != INTEGER_CST
6045       || TREE_CODE (step) != INTEGER_CST)
6046     return false;
6047
6048   /* Check that the max size of the loop will not wrap.  */
6049
6050   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6051     return true;
6052
6053   if (! max_stmt_executions (loop, &ni))
6054     return false;
6055
6056   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6057                             &overflow);
6058   if (overflow)
6059     return false;
6060
6061   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6062                             TYPE_SIGN (lhs_type), &overflow);
6063   if (overflow)
6064     return false;
6065
6066   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6067           <= TYPE_PRECISION (lhs_type));
6068 }
6069
6070 /* Function vectorizable_reduction.
6071
6072    Check if STMT performs a reduction operation that can be vectorized.
6073    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6074    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6075    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6076
6077    This function also handles reduction idioms (patterns) that have been
6078    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6079    of this form:
6080      X = pattern_expr (arg0, arg1, ..., X)
6081    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
6082    sequence that had been detected and replaced by the pattern-stmt (STMT).
6083
6084    This function also handles reduction of condition expressions, for example:
6085      for (int i = 0; i < N; i++)
6086        if (a[i] < value)
6087          last = a[i];
6088    This is handled by vectorising the loop and creating an additional vector
6089    containing the loop indexes for which "a[i] < value" was true.  In the
6090    function epilogue this is reduced to a single max value and then used to
6091    index into the vector of results.
6092
6093    In some cases of reduction patterns, the type of the reduction variable X is
6094    different than the type of the other arguments of STMT.
6095    In such cases, the vectype that is used when transforming STMT into a vector
6096    stmt is different than the vectype that is used to determine the
6097    vectorization factor, because it consists of a different number of elements
6098    than the actual number of elements that are being operated upon in parallel.
6099
6100    For example, consider an accumulation of shorts into an int accumulator.
6101    On some targets it's possible to vectorize this pattern operating on 8
6102    shorts at a time (hence, the vectype for purposes of determining the
6103    vectorization factor should be V8HI); on the other hand, the vectype that
6104    is used to create the vector form is actually V4SI (the type of the result).
6105
6106    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6107    indicates what is the actual level of parallelism (V8HI in the example), so
6108    that the right vectorization factor would be derived.  This vectype
6109    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6110    be used to create the vectorized stmt.  The right vectype for the vectorized
6111    stmt is obtained from the type of the result X:
6112         get_vectype_for_scalar_type (TREE_TYPE (X))
6113
6114    This means that, contrary to "regular" reductions (or "regular" stmts in
6115    general), the following equation:
6116       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6117    does *NOT* necessarily hold for reduction patterns.  */
6118
6119 bool
6120 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6121                         gimple **vec_stmt, slp_tree slp_node,
6122                         slp_instance slp_node_instance,
6123                         stmt_vector_for_cost *cost_vec)
6124 {
6125   tree vec_dest;
6126   tree scalar_dest;
6127   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6128   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6129   tree vectype_in = NULL_TREE;
6130   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6131   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6132   enum tree_code code, orig_code;
6133   internal_fn reduc_fn;
6134   machine_mode vec_mode;
6135   int op_type;
6136   optab optab;
6137   tree new_temp = NULL_TREE;
6138   gimple *def_stmt;
6139   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6140   gimple *cond_reduc_def_stmt = NULL;
6141   enum tree_code cond_reduc_op_code = ERROR_MARK;
6142   tree scalar_type;
6143   bool is_simple_use;
6144   gimple *orig_stmt;
6145   stmt_vec_info orig_stmt_info = NULL;
6146   int i;
6147   int ncopies;
6148   int epilog_copies;
6149   stmt_vec_info prev_stmt_info, prev_phi_info;
6150   bool single_defuse_cycle = false;
6151   gimple *new_stmt = NULL;
6152   int j;
6153   tree ops[3];
6154   enum vect_def_type dts[3];
6155   bool nested_cycle = false, found_nested_cycle_def = false;
6156   bool double_reduc = false;
6157   basic_block def_bb;
6158   struct loop * def_stmt_loop, *outer_loop = NULL;
6159   tree def_arg;
6160   gimple *def_arg_stmt;
6161   auto_vec<tree> vec_oprnds0;
6162   auto_vec<tree> vec_oprnds1;
6163   auto_vec<tree> vec_oprnds2;
6164   auto_vec<tree> vect_defs;
6165   auto_vec<gimple *> phis;
6166   int vec_num;
6167   tree def0, tem;
6168   bool first_p = true;
6169   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6170   tree cond_reduc_val = NULL_TREE;
6171
6172   /* Make sure it was already recognized as a reduction computation.  */
6173   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6174       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6175     return false;
6176
6177   if (nested_in_vect_loop_p (loop, stmt))
6178     {
6179       outer_loop = loop;
6180       loop = loop->inner;
6181       nested_cycle = true;
6182     }
6183
6184   /* In case of reduction chain we switch to the first stmt in the chain, but
6185      we don't update STMT_INFO, since only the last stmt is marked as reduction
6186      and has reduction properties.  */
6187   if (GROUP_FIRST_ELEMENT (stmt_info)
6188       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6189     {
6190       stmt = GROUP_FIRST_ELEMENT (stmt_info);
6191       first_p = false;
6192     }
6193
6194   if (gimple_code (stmt) == GIMPLE_PHI)
6195     {
6196       /* Analysis is fully done on the reduction stmt invocation.  */
6197       if (! vec_stmt)
6198         {
6199           if (slp_node)
6200             slp_node_instance->reduc_phis = slp_node;
6201
6202           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6203           return true;
6204         }
6205
6206       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6207         /* Leave the scalar phi in place.  Note that checking
6208            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6209            for reductions involving a single statement.  */
6210         return true;
6211
6212       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6213       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6214         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6215
6216       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6217           == EXTRACT_LAST_REDUCTION)
6218         /* Leave the scalar phi in place.  */
6219         return true;
6220
6221       gcc_assert (is_gimple_assign (reduc_stmt));
6222       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6223         {
6224           tree op = gimple_op (reduc_stmt, k);
6225           if (op == gimple_phi_result (stmt))
6226             continue;
6227           if (k == 1
6228               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6229             continue;
6230           if (!vectype_in
6231               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6232                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6233             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6234           break;
6235         }
6236       gcc_assert (vectype_in);
6237
6238       if (slp_node)
6239         ncopies = 1;
6240       else
6241         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6242
6243       use_operand_p use_p;
6244       gimple *use_stmt;
6245       if (ncopies > 1
6246           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6247               <= vect_used_only_live)
6248           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6249           && (use_stmt == reduc_stmt
6250               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6251                   == reduc_stmt)))
6252         single_defuse_cycle = true;
6253
6254       /* Create the destination vector  */
6255       scalar_dest = gimple_assign_lhs (reduc_stmt);
6256       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6257
6258       if (slp_node)
6259         /* The size vect_schedule_slp_instance computes is off for us.  */
6260         vec_num = vect_get_num_vectors
6261           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6262            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6263            vectype_in);
6264       else
6265         vec_num = 1;
6266
6267       /* Generate the reduction PHIs upfront.  */
6268       prev_phi_info = NULL;
6269       for (j = 0; j < ncopies; j++)
6270         {
6271           if (j == 0 || !single_defuse_cycle)
6272             {
6273               for (i = 0; i < vec_num; i++)
6274                 {
6275                   /* Create the reduction-phi that defines the reduction
6276                      operand.  */
6277                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6278                   set_vinfo_for_stmt (new_phi,
6279                                       new_stmt_vec_info (new_phi, loop_vinfo));
6280
6281                   if (slp_node)
6282                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6283                   else
6284                     {
6285                       if (j == 0)
6286                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6287                       else
6288                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6289                       prev_phi_info = vinfo_for_stmt (new_phi);
6290                     }
6291                 }
6292             }
6293         }
6294
6295       return true;
6296     }
6297
6298   /* 1. Is vectorizable reduction?  */
6299   /* Not supportable if the reduction variable is used in the loop, unless
6300      it's a reduction chain.  */
6301   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6302       && !GROUP_FIRST_ELEMENT (stmt_info))
6303     return false;
6304
6305   /* Reductions that are not used even in an enclosing outer-loop,
6306      are expected to be "live" (used out of the loop).  */
6307   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6308       && !STMT_VINFO_LIVE_P (stmt_info))
6309     return false;
6310
6311   /* 2. Has this been recognized as a reduction pattern?
6312
6313      Check if STMT represents a pattern that has been recognized
6314      in earlier analysis stages.  For stmts that represent a pattern,
6315      the STMT_VINFO_RELATED_STMT field records the last stmt in
6316      the original sequence that constitutes the pattern.  */
6317
6318   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6319   if (orig_stmt)
6320     {
6321       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6322       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6323       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6324     }
6325
6326   /* 3. Check the operands of the operation.  The first operands are defined
6327         inside the loop body. The last operand is the reduction variable,
6328         which is defined by the loop-header-phi.  */
6329
6330   gcc_assert (is_gimple_assign (stmt));
6331
6332   /* Flatten RHS.  */
6333   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6334     {
6335     case GIMPLE_BINARY_RHS:
6336       code = gimple_assign_rhs_code (stmt);
6337       op_type = TREE_CODE_LENGTH (code);
6338       gcc_assert (op_type == binary_op);
6339       ops[0] = gimple_assign_rhs1 (stmt);
6340       ops[1] = gimple_assign_rhs2 (stmt);
6341       break;
6342
6343     case GIMPLE_TERNARY_RHS:
6344       code = gimple_assign_rhs_code (stmt);
6345       op_type = TREE_CODE_LENGTH (code);
6346       gcc_assert (op_type == ternary_op);
6347       ops[0] = gimple_assign_rhs1 (stmt);
6348       ops[1] = gimple_assign_rhs2 (stmt);
6349       ops[2] = gimple_assign_rhs3 (stmt);
6350       break;
6351
6352     case GIMPLE_UNARY_RHS:
6353       return false;
6354
6355     default:
6356       gcc_unreachable ();
6357     }
6358
6359   if (code == COND_EXPR && slp_node)
6360     return false;
6361
6362   scalar_dest = gimple_assign_lhs (stmt);
6363   scalar_type = TREE_TYPE (scalar_dest);
6364   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6365       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6366     return false;
6367
6368   /* Do not try to vectorize bit-precision reductions.  */
6369   if (!type_has_mode_precision_p (scalar_type))
6370     return false;
6371
6372   /* All uses but the last are expected to be defined in the loop.
6373      The last use is the reduction variable.  In case of nested cycle this
6374      assumption is not true: we use reduc_index to record the index of the
6375      reduction variable.  */
6376   gimple *reduc_def_stmt = NULL;
6377   int reduc_index = -1;
6378   for (i = 0; i < op_type; i++)
6379     {
6380       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6381       if (i == 0 && code == COND_EXPR)
6382         continue;
6383
6384       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6385                                           &def_stmt, &dts[i], &tem);
6386       dt = dts[i];
6387       gcc_assert (is_simple_use);
6388       if (dt == vect_reduction_def)
6389         {
6390           reduc_def_stmt = def_stmt;
6391           reduc_index = i;
6392           continue;
6393         }
6394       else if (tem)
6395         {
6396           /* To properly compute ncopies we are interested in the widest
6397              input type in case we're looking at a widening accumulation.  */
6398           if (!vectype_in
6399               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6400                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6401             vectype_in = tem;
6402         }
6403
6404       if (dt != vect_internal_def
6405           && dt != vect_external_def
6406           && dt != vect_constant_def
6407           && dt != vect_induction_def
6408           && !(dt == vect_nested_cycle && nested_cycle))
6409         return false;
6410
6411       if (dt == vect_nested_cycle)
6412         {
6413           found_nested_cycle_def = true;
6414           reduc_def_stmt = def_stmt;
6415           reduc_index = i;
6416         }
6417
6418       if (i == 1 && code == COND_EXPR)
6419         {
6420           /* Record how value of COND_EXPR is defined.  */
6421           if (dt == vect_constant_def)
6422             {
6423               cond_reduc_dt = dt;
6424               cond_reduc_val = ops[i];
6425             }
6426           if (dt == vect_induction_def
6427               && def_stmt != NULL
6428               && is_nonwrapping_integer_induction (def_stmt, loop))
6429             {
6430               cond_reduc_dt = dt;
6431               cond_reduc_def_stmt = def_stmt;
6432             }
6433         }
6434     }
6435
6436   if (!vectype_in)
6437     vectype_in = vectype_out;
6438
6439   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6440      directy used in stmt.  */
6441   if (reduc_index == -1)
6442     {
6443       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6444         {
6445           if (dump_enabled_p ())
6446             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6447                              "in-order reduction chain without SLP.\n");
6448           return false;
6449         }
6450
6451       if (orig_stmt)
6452         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6453       else
6454         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6455     }
6456
6457   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6458     return false;
6459
6460   if (!(reduc_index == -1
6461         || dts[reduc_index] == vect_reduction_def
6462         || dts[reduc_index] == vect_nested_cycle
6463         || ((dts[reduc_index] == vect_internal_def
6464              || dts[reduc_index] == vect_external_def
6465              || dts[reduc_index] == vect_constant_def
6466              || dts[reduc_index] == vect_induction_def)
6467             && nested_cycle && found_nested_cycle_def)))
6468     {
6469       /* For pattern recognized stmts, orig_stmt might be a reduction,
6470          but some helper statements for the pattern might not, or
6471          might be COND_EXPRs with reduction uses in the condition.  */
6472       gcc_assert (orig_stmt);
6473       return false;
6474     }
6475
6476   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6477   enum vect_reduction_type v_reduc_type
6478     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6479   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6480
6481   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6482   /* If we have a condition reduction, see if we can simplify it further.  */
6483   if (v_reduc_type == COND_REDUCTION)
6484     {
6485       /* TODO: We can't yet handle reduction chains, since we need to treat
6486          each COND_EXPR in the chain specially, not just the last one.
6487          E.g. for:
6488
6489             x_1 = PHI <x_3, ...>
6490             x_2 = a_2 ? ... : x_1;
6491             x_3 = a_3 ? ... : x_2;
6492
6493          we're interested in the last element in x_3 for which a_2 || a_3
6494          is true, whereas the current reduction chain handling would
6495          vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6496          as a reduction operation.  */
6497       if (reduc_index == -1)
6498         {
6499           if (dump_enabled_p ())
6500             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6501                              "conditional reduction chains not supported\n");
6502           return false;
6503         }
6504
6505       /* vect_is_simple_reduction ensured that operand 2 is the
6506          loop-carried operand.  */
6507       gcc_assert (reduc_index == 2);
6508
6509       /* Loop peeling modifies initial value of reduction PHI, which
6510          makes the reduction stmt to be transformed different to the
6511          original stmt analyzed.  We need to record reduction code for
6512          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6513          it can be used directly at transform stage.  */
6514       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6515           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6516         {
6517           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6518           gcc_assert (cond_reduc_dt == vect_constant_def);
6519           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6520         }
6521       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6522                                                vectype_in, OPTIMIZE_FOR_SPEED))
6523         {
6524           if (dump_enabled_p ())
6525             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6526                              "optimizing condition reduction with"
6527                              " FOLD_EXTRACT_LAST.\n");
6528           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6529         }
6530       else if (cond_reduc_dt == vect_induction_def)
6531         {
6532           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6533           tree base
6534             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6535           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6536
6537           gcc_assert (TREE_CODE (base) == INTEGER_CST
6538                       && TREE_CODE (step) == INTEGER_CST);
6539           cond_reduc_val = NULL_TREE;
6540           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6541              above base; punt if base is the minimum value of the type for
6542              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6543           if (tree_int_cst_sgn (step) == -1)
6544             {
6545               cond_reduc_op_code = MIN_EXPR;
6546               if (tree_int_cst_sgn (base) == -1)
6547                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6548               else if (tree_int_cst_lt (base,
6549                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6550                 cond_reduc_val
6551                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6552             }
6553           else
6554             {
6555               cond_reduc_op_code = MAX_EXPR;
6556               if (tree_int_cst_sgn (base) == 1)
6557                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6558               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6559                                         base))
6560                 cond_reduc_val
6561                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6562             }
6563           if (cond_reduc_val)
6564             {
6565               if (dump_enabled_p ())
6566                 dump_printf_loc (MSG_NOTE, vect_location,
6567                                  "condition expression based on "
6568                                  "integer induction.\n");
6569               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6570                 = INTEGER_INDUC_COND_REDUCTION;
6571             }
6572         }
6573       else if (cond_reduc_dt == vect_constant_def)
6574         {
6575           enum vect_def_type cond_initial_dt;
6576           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6577           tree cond_initial_val
6578             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6579
6580           gcc_assert (cond_reduc_val != NULL_TREE);
6581           vect_is_simple_use (cond_initial_val, loop_vinfo,
6582                               &def_stmt, &cond_initial_dt);
6583           if (cond_initial_dt == vect_constant_def
6584               && types_compatible_p (TREE_TYPE (cond_initial_val),
6585                                      TREE_TYPE (cond_reduc_val)))
6586             {
6587               tree e = fold_binary (LE_EXPR, boolean_type_node,
6588                                     cond_initial_val, cond_reduc_val);
6589               if (e && (integer_onep (e) || integer_zerop (e)))
6590                 {
6591                   if (dump_enabled_p ())
6592                     dump_printf_loc (MSG_NOTE, vect_location,
6593                                      "condition expression based on "
6594                                      "compile time constant.\n");
6595                   /* Record reduction code at analysis stage.  */
6596                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6597                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6598                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6599                     = CONST_COND_REDUCTION;
6600                 }
6601             }
6602         }
6603     }
6604
6605   if (orig_stmt)
6606     gcc_assert (tmp == orig_stmt
6607                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6608   else
6609     /* We changed STMT to be the first stmt in reduction chain, hence we
6610        check that in this case the first element in the chain is STMT.  */
6611     gcc_assert (stmt == tmp
6612                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6613
6614   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6615     return false;
6616
6617   if (slp_node)
6618     ncopies = 1;
6619   else
6620     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6621
6622   gcc_assert (ncopies >= 1);
6623
6624   vec_mode = TYPE_MODE (vectype_in);
6625   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6626
6627   if (code == COND_EXPR)
6628     {
6629       /* Only call during the analysis stage, otherwise we'll lose
6630          STMT_VINFO_TYPE.  */
6631       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6632                                                 ops[reduc_index], 0, NULL,
6633                                                 cost_vec))
6634         {
6635           if (dump_enabled_p ())
6636             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6637                              "unsupported condition in reduction\n");
6638           return false;
6639         }
6640     }
6641   else
6642     {
6643       /* 4. Supportable by target?  */
6644
6645       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6646           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6647         {
6648           /* Shifts and rotates are only supported by vectorizable_shifts,
6649              not vectorizable_reduction.  */
6650           if (dump_enabled_p ())
6651             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6652                              "unsupported shift or rotation.\n");
6653           return false;
6654         }
6655
6656       /* 4.1. check support for the operation in the loop  */
6657       optab = optab_for_tree_code (code, vectype_in, optab_default);
6658       if (!optab)
6659         {
6660           if (dump_enabled_p ())
6661             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6662                              "no optab.\n");
6663
6664           return false;
6665         }
6666
6667       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6668         {
6669           if (dump_enabled_p ())
6670             dump_printf (MSG_NOTE, "op not supported by target.\n");
6671
6672           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6673               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6674             return false;
6675
6676           if (dump_enabled_p ())
6677             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6678         }
6679
6680       /* Worthwhile without SIMD support?  */
6681       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6682           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6683         {
6684           if (dump_enabled_p ())
6685             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6686                              "not worthwhile without SIMD support.\n");
6687
6688           return false;
6689         }
6690     }
6691
6692   /* 4.2. Check support for the epilog operation.
6693
6694           If STMT represents a reduction pattern, then the type of the
6695           reduction variable may be different than the type of the rest
6696           of the arguments.  For example, consider the case of accumulation
6697           of shorts into an int accumulator; The original code:
6698                         S1: int_a = (int) short_a;
6699           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6700
6701           was replaced with:
6702                         STMT: int_acc = widen_sum <short_a, int_acc>
6703
6704           This means that:
6705           1. The tree-code that is used to create the vector operation in the
6706              epilog code (that reduces the partial results) is not the
6707              tree-code of STMT, but is rather the tree-code of the original
6708              stmt from the pattern that STMT is replacing.  I.e, in the example
6709              above we want to use 'widen_sum' in the loop, but 'plus' in the
6710              epilog.
6711           2. The type (mode) we use to check available target support
6712              for the vector operation to be created in the *epilog*, is
6713              determined by the type of the reduction variable (in the example
6714              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6715              However the type (mode) we use to check available target support
6716              for the vector operation to be created *inside the loop*, is
6717              determined by the type of the other arguments to STMT (in the
6718              example we'd check this: optab_handler (widen_sum_optab,
6719              vect_short_mode)).
6720
6721           This is contrary to "regular" reductions, in which the types of all
6722           the arguments are the same as the type of the reduction variable.
6723           For "regular" reductions we can therefore use the same vector type
6724           (and also the same tree-code) when generating the epilog code and
6725           when generating the code inside the loop.  */
6726
6727   vect_reduction_type reduction_type
6728     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6729   if (orig_stmt
6730       && (reduction_type == TREE_CODE_REDUCTION
6731           || reduction_type == FOLD_LEFT_REDUCTION))
6732     {
6733       /* This is a reduction pattern: get the vectype from the type of the
6734          reduction variable, and get the tree-code from orig_stmt.  */
6735       orig_code = gimple_assign_rhs_code (orig_stmt);
6736       gcc_assert (vectype_out);
6737       vec_mode = TYPE_MODE (vectype_out);
6738     }
6739   else
6740     {
6741       /* Regular reduction: use the same vectype and tree-code as used for
6742          the vector code inside the loop can be used for the epilog code. */
6743       orig_code = code;
6744
6745       if (code == MINUS_EXPR)
6746         orig_code = PLUS_EXPR;
6747
6748       /* For simple condition reductions, replace with the actual expression
6749          we want to base our reduction around.  */
6750       if (reduction_type == CONST_COND_REDUCTION)
6751         {
6752           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6753           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6754         }
6755       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6756         orig_code = cond_reduc_op_code;
6757     }
6758
6759   if (nested_cycle)
6760     {
6761       def_bb = gimple_bb (reduc_def_stmt);
6762       def_stmt_loop = def_bb->loop_father;
6763       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6764                                        loop_preheader_edge (def_stmt_loop));
6765       if (TREE_CODE (def_arg) == SSA_NAME
6766           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6767           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6768           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6769           && vinfo_for_stmt (def_arg_stmt)
6770           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6771               == vect_double_reduction_def)
6772         double_reduc = true;
6773     }
6774
6775   reduc_fn = IFN_LAST;
6776
6777   if (reduction_type == TREE_CODE_REDUCTION
6778       || reduction_type == FOLD_LEFT_REDUCTION
6779       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6780       || reduction_type == CONST_COND_REDUCTION)
6781     {
6782       if (reduction_type == FOLD_LEFT_REDUCTION
6783           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6784           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6785         {
6786           if (reduc_fn != IFN_LAST
6787               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6788                                                   OPTIMIZE_FOR_SPEED))
6789             {
6790               if (dump_enabled_p ())
6791                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6792                                  "reduc op not supported by target.\n");
6793
6794               reduc_fn = IFN_LAST;
6795             }
6796         }
6797       else
6798         {
6799           if (!nested_cycle || double_reduc)
6800             {
6801               if (dump_enabled_p ())
6802                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6803                                  "no reduc code for scalar code.\n");
6804
6805               return false;
6806             }
6807         }
6808     }
6809   else if (reduction_type == COND_REDUCTION)
6810     {
6811       int scalar_precision
6812         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6813       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6814       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6815                                                 nunits_out);
6816
6817       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6818                                           OPTIMIZE_FOR_SPEED))
6819         reduc_fn = IFN_REDUC_MAX;
6820     }
6821
6822   if (reduction_type != EXTRACT_LAST_REDUCTION
6823       && reduc_fn == IFN_LAST
6824       && !nunits_out.is_constant ())
6825     {
6826       if (dump_enabled_p ())
6827         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6828                          "missing target support for reduction on"
6829                          " variable-length vectors.\n");
6830       return false;
6831     }
6832
6833   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6834       && ncopies > 1)
6835     {
6836       if (dump_enabled_p ())
6837         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6838                          "multiple types in double reduction or condition "
6839                          "reduction.\n");
6840       return false;
6841     }
6842
6843   /* For SLP reductions, see if there is a neutral value we can use.  */
6844   tree neutral_op = NULL_TREE;
6845   if (slp_node)
6846     neutral_op
6847       = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
6848                                       GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6849
6850   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6851     {
6852       /* We can't support in-order reductions of code such as this:
6853
6854            for (int i = 0; i < n1; ++i)
6855              for (int j = 0; j < n2; ++j)
6856                l += a[j];
6857
6858          since GCC effectively transforms the loop when vectorizing:
6859
6860            for (int i = 0; i < n1 / VF; ++i)
6861              for (int j = 0; j < n2; ++j)
6862                for (int k = 0; k < VF; ++k)
6863                  l += a[j];
6864
6865          which is a reassociation of the original operation.  */
6866       if (dump_enabled_p ())
6867         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6868                          "in-order double reduction not supported.\n");
6869
6870       return false;
6871     }
6872
6873   if (reduction_type == FOLD_LEFT_REDUCTION
6874       && slp_node
6875       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6876     {
6877       /* We cannot use in-order reductions in this case because there is
6878          an implicit reassociation of the operations involved.  */
6879       if (dump_enabled_p ())
6880         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6881                          "in-order unchained SLP reductions not supported.\n");
6882       return false;
6883     }
6884
6885   /* For double reductions, and for SLP reductions with a neutral value,
6886      we construct a variable-length initial vector by loading a vector
6887      full of the neutral value and then shift-and-inserting the start
6888      values into the low-numbered elements.  */
6889   if ((double_reduc || neutral_op)
6890       && !nunits_out.is_constant ()
6891       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6892                                           vectype_out, OPTIMIZE_FOR_SPEED))
6893     {
6894       if (dump_enabled_p ())
6895         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6896                          "reduction on variable-length vectors requires"
6897                          " target support for a vector-shift-and-insert"
6898                          " operation.\n");
6899       return false;
6900     }
6901
6902   /* Check extra constraints for variable-length unchained SLP reductions.  */
6903   if (STMT_SLP_TYPE (stmt_info)
6904       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6905       && !nunits_out.is_constant ())
6906     {
6907       /* We checked above that we could build the initial vector when
6908          there's a neutral element value.  Check here for the case in
6909          which each SLP statement has its own initial value and in which
6910          that value needs to be repeated for every instance of the
6911          statement within the initial vector.  */
6912       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6913       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6914       if (!neutral_op
6915           && !can_duplicate_and_interleave_p (group_size, elt_mode))
6916         {
6917           if (dump_enabled_p ())
6918             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6919                              "unsupported form of SLP reduction for"
6920                              " variable-length vectors: cannot build"
6921                              " initial vector.\n");
6922           return false;
6923         }
6924       /* The epilogue code relies on the number of elements being a multiple
6925          of the group size.  The duplicate-and-interleave approach to setting
6926          up the the initial vector does too.  */
6927       if (!multiple_p (nunits_out, group_size))
6928         {
6929           if (dump_enabled_p ())
6930             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6931                              "unsupported form of SLP reduction for"
6932                              " variable-length vectors: the vector size"
6933                              " is not a multiple of the number of results.\n");
6934           return false;
6935         }
6936     }
6937
6938   /* In case of widenning multiplication by a constant, we update the type
6939      of the constant to be the type of the other operand.  We check that the
6940      constant fits the type in the pattern recognition pass.  */
6941   if (code == DOT_PROD_EXPR
6942       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6943     {
6944       if (TREE_CODE (ops[0]) == INTEGER_CST)
6945         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6946       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6947         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6948       else
6949         {
6950           if (dump_enabled_p ())
6951             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6952                              "invalid types in dot-prod\n");
6953
6954           return false;
6955         }
6956     }
6957
6958   if (reduction_type == COND_REDUCTION)
6959     {
6960       widest_int ni;
6961
6962       if (! max_loop_iterations (loop, &ni))
6963         {
6964           if (dump_enabled_p ())
6965             dump_printf_loc (MSG_NOTE, vect_location,
6966                              "loop count not known, cannot create cond "
6967                              "reduction.\n");
6968           return false;
6969         }
6970       /* Convert backedges to iterations.  */
6971       ni += 1;
6972
6973       /* The additional index will be the same type as the condition.  Check
6974          that the loop can fit into this less one (because we'll use up the
6975          zero slot for when there are no matches).  */
6976       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6977       if (wi::geu_p (ni, wi::to_widest (max_index)))
6978         {
6979           if (dump_enabled_p ())
6980             dump_printf_loc (MSG_NOTE, vect_location,
6981                              "loop size is greater than data size.\n");
6982           return false;
6983         }
6984     }
6985
6986   /* In case the vectorization factor (VF) is bigger than the number
6987      of elements that we can fit in a vectype (nunits), we have to generate
6988      more than one vector stmt - i.e - we need to "unroll" the
6989      vector stmt by a factor VF/nunits.  For more details see documentation
6990      in vectorizable_operation.  */
6991
6992   /* If the reduction is used in an outer loop we need to generate
6993      VF intermediate results, like so (e.g. for ncopies=2):
6994         r0 = phi (init, r0)
6995         r1 = phi (init, r1)
6996         r0 = x0 + r0;
6997         r1 = x1 + r1;
6998     (i.e. we generate VF results in 2 registers).
6999     In this case we have a separate def-use cycle for each copy, and therefore
7000     for each copy we get the vector def for the reduction variable from the
7001     respective phi node created for this copy.
7002
7003     Otherwise (the reduction is unused in the loop nest), we can combine
7004     together intermediate results, like so (e.g. for ncopies=2):
7005         r = phi (init, r)
7006         r = x0 + r;
7007         r = x1 + r;
7008    (i.e. we generate VF/2 results in a single register).
7009    In this case for each copy we get the vector def for the reduction variable
7010    from the vectorized reduction operation generated in the previous iteration.
7011
7012    This only works when we see both the reduction PHI and its only consumer
7013    in vectorizable_reduction and there are no intermediate stmts
7014    participating.  */
7015   use_operand_p use_p;
7016   gimple *use_stmt;
7017   if (ncopies > 1
7018       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7019       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7020       && (use_stmt == stmt
7021           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7022     {
7023       single_defuse_cycle = true;
7024       epilog_copies = 1;
7025     }
7026   else
7027     epilog_copies = ncopies;
7028
7029   /* If the reduction stmt is one of the patterns that have lane
7030      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7031   if ((ncopies > 1
7032        && ! single_defuse_cycle)
7033       && (code == DOT_PROD_EXPR
7034           || code == WIDEN_SUM_EXPR
7035           || code == SAD_EXPR))
7036     {
7037       if (dump_enabled_p ())
7038         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7039                          "multi def-use cycle not possible for lane-reducing "
7040                          "reduction operation\n");
7041       return false;
7042     }
7043
7044   if (slp_node)
7045     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7046   else
7047     vec_num = 1;
7048
7049   internal_fn cond_fn = get_conditional_internal_fn (code);
7050   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7051
7052   if (!vec_stmt) /* transformation not required.  */
7053     {
7054       if (first_p)
7055         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7056       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7057         {
7058           if (reduction_type != FOLD_LEFT_REDUCTION
7059               && (cond_fn == IFN_LAST
7060                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7061                                                       OPTIMIZE_FOR_SPEED)))
7062             {
7063               if (dump_enabled_p ())
7064                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7065                                  "can't use a fully-masked loop because no"
7066                                  " conditional operation is available.\n");
7067               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7068             }
7069           else if (reduc_index == -1)
7070             {
7071               if (dump_enabled_p ())
7072                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7073                                  "can't use a fully-masked loop for chained"
7074                                  " reductions.\n");
7075               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7076             }
7077           else
7078             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7079                                    vectype_in);
7080         }
7081       if (dump_enabled_p ()
7082           && reduction_type == FOLD_LEFT_REDUCTION)
7083         dump_printf_loc (MSG_NOTE, vect_location,
7084                          "using an in-order (fold-left) reduction.\n");
7085       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7086       return true;
7087     }
7088
7089   /* Transform.  */
7090
7091   if (dump_enabled_p ())
7092     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7093
7094   /* FORNOW: Multiple types are not supported for condition.  */
7095   if (code == COND_EXPR)
7096     gcc_assert (ncopies == 1);
7097
7098   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7099
7100   if (reduction_type == FOLD_LEFT_REDUCTION)
7101     return vectorize_fold_left_reduction
7102       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7103        reduc_fn, ops, vectype_in, reduc_index, masks);
7104
7105   if (reduction_type == EXTRACT_LAST_REDUCTION)
7106     {
7107       gcc_assert (!slp_node);
7108       return vectorizable_condition (stmt, gsi, vec_stmt,
7109                                      NULL, reduc_index, NULL, NULL);
7110     }
7111
7112   /* Create the destination vector  */
7113   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7114
7115   prev_stmt_info = NULL;
7116   prev_phi_info = NULL;
7117   if (!slp_node)
7118     {
7119       vec_oprnds0.create (1);
7120       vec_oprnds1.create (1);
7121       if (op_type == ternary_op)
7122         vec_oprnds2.create (1);
7123     }
7124
7125   phis.create (vec_num);
7126   vect_defs.create (vec_num);
7127   if (!slp_node)
7128     vect_defs.quick_push (NULL_TREE);
7129
7130   if (slp_node)
7131     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7132   else
7133     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7134
7135   for (j = 0; j < ncopies; j++)
7136     {
7137       if (code == COND_EXPR)
7138         {
7139           gcc_assert (!slp_node);
7140           vectorizable_condition (stmt, gsi, vec_stmt,
7141                                   PHI_RESULT (phis[0]),
7142                                   reduc_index, NULL, NULL);
7143           /* Multiple types are not supported for condition.  */
7144           break;
7145         }
7146
7147       /* Handle uses.  */
7148       if (j == 0)
7149         {
7150           if (slp_node)
7151             {
7152               /* Get vec defs for all the operands except the reduction index,
7153                  ensuring the ordering of the ops in the vector is kept.  */
7154               auto_vec<tree, 3> slp_ops;
7155               auto_vec<vec<tree>, 3> vec_defs;
7156
7157               slp_ops.quick_push (ops[0]);
7158               slp_ops.quick_push (ops[1]);
7159               if (op_type == ternary_op)
7160                 slp_ops.quick_push (ops[2]);
7161
7162               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7163
7164               vec_oprnds0.safe_splice (vec_defs[0]);
7165               vec_defs[0].release ();
7166               vec_oprnds1.safe_splice (vec_defs[1]);
7167               vec_defs[1].release ();
7168               if (op_type == ternary_op)
7169                 {
7170                   vec_oprnds2.safe_splice (vec_defs[2]);
7171                   vec_defs[2].release ();
7172                 }
7173             }
7174           else
7175             {
7176               vec_oprnds0.quick_push
7177                 (vect_get_vec_def_for_operand (ops[0], stmt));
7178               vec_oprnds1.quick_push
7179                 (vect_get_vec_def_for_operand (ops[1], stmt));
7180               if (op_type == ternary_op)
7181                 vec_oprnds2.quick_push
7182                   (vect_get_vec_def_for_operand (ops[2], stmt));
7183             }
7184         }
7185       else
7186         {
7187           if (!slp_node)
7188             {
7189               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7190
7191               if (single_defuse_cycle && reduc_index == 0)
7192                 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7193               else
7194                 vec_oprnds0[0]
7195                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7196               if (single_defuse_cycle && reduc_index == 1)
7197                 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7198               else
7199                 vec_oprnds1[0]
7200                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7201               if (op_type == ternary_op)
7202                 {
7203                   if (single_defuse_cycle && reduc_index == 2)
7204                     vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7205                   else
7206                     vec_oprnds2[0]
7207                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7208                 }
7209             }
7210         }
7211
7212       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7213         {
7214           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7215           if (masked_loop_p)
7216             {
7217               /* Make sure that the reduction accumulator is vop[0].  */
7218               if (reduc_index == 1)
7219                 {
7220                   gcc_assert (commutative_tree_code (code));
7221                   std::swap (vop[0], vop[1]);
7222                 }
7223               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7224                                               vectype_in, i * ncopies + j);
7225               gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7226                                                         vop[0], vop[1]);
7227               new_temp = make_ssa_name (vec_dest, call);
7228               gimple_call_set_lhs (call, new_temp);
7229               gimple_call_set_nothrow (call, true);
7230               new_stmt = call;
7231             }
7232           else
7233             {
7234               if (op_type == ternary_op)
7235                 vop[2] = vec_oprnds2[i];
7236
7237               new_temp = make_ssa_name (vec_dest, new_stmt);
7238               new_stmt = gimple_build_assign (new_temp, code,
7239                                               vop[0], vop[1], vop[2]);
7240             }
7241           vect_finish_stmt_generation (stmt, new_stmt, gsi);
7242
7243           if (slp_node)
7244             {
7245               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7246               vect_defs.quick_push (new_temp);
7247             }
7248           else
7249             vect_defs[0] = new_temp;
7250         }
7251
7252       if (slp_node)
7253         continue;
7254
7255       if (j == 0)
7256         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7257       else
7258         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7259
7260       prev_stmt_info = vinfo_for_stmt (new_stmt);
7261     }
7262
7263   /* Finalize the reduction-phi (set its arguments) and create the
7264      epilog reduction code.  */
7265   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7266     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7267
7268   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7269                                     epilog_copies, reduc_fn, phis,
7270                                     double_reduc, slp_node, slp_node_instance,
7271                                     cond_reduc_val, cond_reduc_op_code,
7272                                     neutral_op);
7273
7274   return true;
7275 }
7276
7277 /* Function vect_min_worthwhile_factor.
7278
7279    For a loop where we could vectorize the operation indicated by CODE,
7280    return the minimum vectorization factor that makes it worthwhile
7281    to use generic vectors.  */
7282 static unsigned int
7283 vect_min_worthwhile_factor (enum tree_code code)
7284 {
7285   switch (code)
7286     {
7287     case PLUS_EXPR:
7288     case MINUS_EXPR:
7289     case NEGATE_EXPR:
7290       return 4;
7291
7292     case BIT_AND_EXPR:
7293     case BIT_IOR_EXPR:
7294     case BIT_XOR_EXPR:
7295     case BIT_NOT_EXPR:
7296       return 2;
7297
7298     default:
7299       return INT_MAX;
7300     }
7301 }
7302
7303 /* Return true if VINFO indicates we are doing loop vectorization and if
7304    it is worth decomposing CODE operations into scalar operations for
7305    that loop's vectorization factor.  */
7306
7307 bool
7308 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7309 {
7310   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7311   unsigned HOST_WIDE_INT value;
7312   return (loop_vinfo
7313           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7314           && value >= vect_min_worthwhile_factor (code));
7315 }
7316
7317 /* Function vectorizable_induction
7318
7319    Check if PHI performs an induction computation that can be vectorized.
7320    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7321    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7322    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7323
7324 bool
7325 vectorizable_induction (gimple *phi,
7326                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7327                         gimple **vec_stmt, slp_tree slp_node,
7328                         stmt_vector_for_cost *cost_vec)
7329 {
7330   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7331   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7332   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7333   unsigned ncopies;
7334   bool nested_in_vect_loop = false;
7335   struct loop *iv_loop;
7336   tree vec_def;
7337   edge pe = loop_preheader_edge (loop);
7338   basic_block new_bb;
7339   tree new_vec, vec_init, vec_step, t;
7340   tree new_name;
7341   gimple *new_stmt;
7342   gphi *induction_phi;
7343   tree induc_def, vec_dest;
7344   tree init_expr, step_expr;
7345   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7346   unsigned i;
7347   tree expr;
7348   gimple_seq stmts;
7349   imm_use_iterator imm_iter;
7350   use_operand_p use_p;
7351   gimple *exit_phi;
7352   edge latch_e;
7353   tree loop_arg;
7354   gimple_stmt_iterator si;
7355   basic_block bb = gimple_bb (phi);
7356
7357   if (gimple_code (phi) != GIMPLE_PHI)
7358     return false;
7359
7360   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7361     return false;
7362
7363   /* Make sure it was recognized as induction computation.  */
7364   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7365     return false;
7366
7367   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7368   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7369
7370   if (slp_node)
7371     ncopies = 1;
7372   else
7373     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7374   gcc_assert (ncopies >= 1);
7375
7376   /* FORNOW. These restrictions should be relaxed.  */
7377   if (nested_in_vect_loop_p (loop, phi))
7378     {
7379       imm_use_iterator imm_iter;
7380       use_operand_p use_p;
7381       gimple *exit_phi;
7382       edge latch_e;
7383       tree loop_arg;
7384
7385       if (ncopies > 1)
7386         {
7387           if (dump_enabled_p ())
7388             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7389                              "multiple types in nested loop.\n");
7390           return false;
7391         }
7392
7393       /* FORNOW: outer loop induction with SLP not supported.  */
7394       if (STMT_SLP_TYPE (stmt_info))
7395         return false;
7396
7397       exit_phi = NULL;
7398       latch_e = loop_latch_edge (loop->inner);
7399       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7400       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7401         {
7402           gimple *use_stmt = USE_STMT (use_p);
7403           if (is_gimple_debug (use_stmt))
7404             continue;
7405
7406           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7407             {
7408               exit_phi = use_stmt;
7409               break;
7410             }
7411         }
7412       if (exit_phi)
7413         {
7414           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7415           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7416                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7417             {
7418               if (dump_enabled_p ())
7419                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7420                                  "inner-loop induction only used outside "
7421                                  "of the outer vectorized loop.\n");
7422               return false;
7423             }
7424         }
7425
7426       nested_in_vect_loop = true;
7427       iv_loop = loop->inner;
7428     }
7429   else
7430     iv_loop = loop;
7431   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7432
7433   if (slp_node && !nunits.is_constant ())
7434     {
7435       /* The current SLP code creates the initial value element-by-element.  */
7436       if (dump_enabled_p ())
7437         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7438                          "SLP induction not supported for variable-length"
7439                          " vectors.\n");
7440       return false;
7441     }
7442
7443   if (!vec_stmt) /* transformation not required.  */
7444     {
7445       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7446       if (dump_enabled_p ())
7447         dump_printf_loc (MSG_NOTE, vect_location,
7448                          "=== vectorizable_induction ===\n");
7449       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7450       return true;
7451     }
7452
7453   /* Transform.  */
7454
7455   /* Compute a vector variable, initialized with the first VF values of
7456      the induction variable.  E.g., for an iv with IV_PHI='X' and
7457      evolution S, for a vector of 4 units, we want to compute:
7458      [X, X + S, X + 2*S, X + 3*S].  */
7459
7460   if (dump_enabled_p ())
7461     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7462
7463   latch_e = loop_latch_edge (iv_loop);
7464   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7465
7466   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7467   gcc_assert (step_expr != NULL_TREE);
7468
7469   pe = loop_preheader_edge (iv_loop);
7470   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7471                                      loop_preheader_edge (iv_loop));
7472
7473   stmts = NULL;
7474   if (!nested_in_vect_loop)
7475     {
7476       /* Convert the initial value to the desired type.  */
7477       tree new_type = TREE_TYPE (vectype);
7478       init_expr = gimple_convert (&stmts, new_type, init_expr);
7479
7480       /* If we are using the loop mask to "peel" for alignment then we need
7481          to adjust the start value here.  */
7482       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7483       if (skip_niters != NULL_TREE)
7484         {
7485           if (FLOAT_TYPE_P (vectype))
7486             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7487                                         skip_niters);
7488           else
7489             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7490           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7491                                          skip_niters, step_expr);
7492           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7493                                     init_expr, skip_step);
7494         }
7495     }
7496
7497   /* Convert the step to the desired type.  */
7498   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7499
7500   if (stmts)
7501     {
7502       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7503       gcc_assert (!new_bb);
7504     }
7505
7506   /* Find the first insertion point in the BB.  */
7507   si = gsi_after_labels (bb);
7508
7509   /* For SLP induction we have to generate several IVs as for example
7510      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7511      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7512      [VF*S, VF*S, VF*S, VF*S] for all.  */
7513   if (slp_node)
7514     {
7515       /* Enforced above.  */
7516       unsigned int const_nunits = nunits.to_constant ();
7517
7518       /* Generate [VF*S, VF*S, ... ].  */
7519       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7520         {
7521           expr = build_int_cst (integer_type_node, vf);
7522           expr = fold_convert (TREE_TYPE (step_expr), expr);
7523         }
7524       else
7525         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7526       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7527                               expr, step_expr);
7528       if (! CONSTANT_CLASS_P (new_name))
7529         new_name = vect_init_vector (phi, new_name,
7530                                      TREE_TYPE (step_expr), NULL);
7531       new_vec = build_vector_from_val (vectype, new_name);
7532       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7533
7534       /* Now generate the IVs.  */
7535       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7536       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7537       unsigned elts = const_nunits * nvects;
7538       unsigned nivs = least_common_multiple (group_size,
7539                                              const_nunits) / const_nunits;
7540       gcc_assert (elts % group_size == 0);
7541       tree elt = init_expr;
7542       unsigned ivn;
7543       for (ivn = 0; ivn < nivs; ++ivn)
7544         {
7545           tree_vector_builder elts (vectype, const_nunits, 1);
7546           stmts = NULL;
7547           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7548             {
7549               if (ivn*const_nunits + eltn >= group_size
7550                   && (ivn * const_nunits + eltn) % group_size == 0)
7551                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7552                                     elt, step_expr);
7553               elts.quick_push (elt);
7554             }
7555           vec_init = gimple_build_vector (&stmts, &elts);
7556           if (stmts)
7557             {
7558               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7559               gcc_assert (!new_bb);
7560             }
7561
7562           /* Create the induction-phi that defines the induction-operand.  */
7563           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7564           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7565           set_vinfo_for_stmt (induction_phi,
7566                               new_stmt_vec_info (induction_phi, loop_vinfo));
7567           induc_def = PHI_RESULT (induction_phi);
7568
7569           /* Create the iv update inside the loop  */
7570           vec_def = make_ssa_name (vec_dest);
7571           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7572           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7573           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7574
7575           /* Set the arguments of the phi node:  */
7576           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7577           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7578                        UNKNOWN_LOCATION);
7579
7580           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7581         }
7582
7583       /* Re-use IVs when we can.  */
7584       if (ivn < nvects)
7585         {
7586           unsigned vfp
7587             = least_common_multiple (group_size, const_nunits) / group_size;
7588           /* Generate [VF'*S, VF'*S, ... ].  */
7589           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7590             {
7591               expr = build_int_cst (integer_type_node, vfp);
7592               expr = fold_convert (TREE_TYPE (step_expr), expr);
7593             }
7594           else
7595             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7596           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7597                                   expr, step_expr);
7598           if (! CONSTANT_CLASS_P (new_name))
7599             new_name = vect_init_vector (phi, new_name,
7600                                          TREE_TYPE (step_expr), NULL);
7601           new_vec = build_vector_from_val (vectype, new_name);
7602           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7603           for (; ivn < nvects; ++ivn)
7604             {
7605               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7606               tree def;
7607               if (gimple_code (iv) == GIMPLE_PHI)
7608                 def = gimple_phi_result (iv);
7609               else
7610                 def = gimple_assign_lhs (iv);
7611               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7612                                               PLUS_EXPR,
7613                                               def, vec_step);
7614               if (gimple_code (iv) == GIMPLE_PHI)
7615                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7616               else
7617                 {
7618                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7619                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7620                 }
7621               set_vinfo_for_stmt (new_stmt,
7622                                   new_stmt_vec_info (new_stmt, loop_vinfo));
7623               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7624             }
7625         }
7626
7627       return true;
7628     }
7629
7630   /* Create the vector that holds the initial_value of the induction.  */
7631   if (nested_in_vect_loop)
7632     {
7633       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7634          been created during vectorization of previous stmts.  We obtain it
7635          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7636       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7637       /* If the initial value is not of proper type, convert it.  */
7638       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7639         {
7640           new_stmt
7641             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7642                                                           vect_simple_var,
7643                                                           "vec_iv_"),
7644                                    VIEW_CONVERT_EXPR,
7645                                    build1 (VIEW_CONVERT_EXPR, vectype,
7646                                            vec_init));
7647           vec_init = gimple_assign_lhs (new_stmt);
7648           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7649                                                  new_stmt);
7650           gcc_assert (!new_bb);
7651           set_vinfo_for_stmt (new_stmt,
7652                               new_stmt_vec_info (new_stmt, loop_vinfo));
7653         }
7654     }
7655   else
7656     {
7657       /* iv_loop is the loop to be vectorized. Create:
7658          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7659       stmts = NULL;
7660       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7661
7662       unsigned HOST_WIDE_INT const_nunits;
7663       if (nunits.is_constant (&const_nunits))
7664         {
7665           tree_vector_builder elts (vectype, const_nunits, 1);
7666           elts.quick_push (new_name);
7667           for (i = 1; i < const_nunits; i++)
7668             {
7669               /* Create: new_name_i = new_name + step_expr  */
7670               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7671                                        new_name, step_expr);
7672               elts.quick_push (new_name);
7673             }
7674           /* Create a vector from [new_name_0, new_name_1, ...,
7675              new_name_nunits-1]  */
7676           vec_init = gimple_build_vector (&stmts, &elts);
7677         }
7678       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7679         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7680         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7681                                  new_name, step_expr);
7682       else
7683         {
7684           /* Build:
7685                 [base, base, base, ...]
7686                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7687           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7688           gcc_assert (flag_associative_math);
7689           tree index = build_index_vector (vectype, 0, 1);
7690           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7691                                                         new_name);
7692           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7693                                                         step_expr);
7694           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7695           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7696                                    vec_init, step_vec);
7697           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7698                                    vec_init, base_vec);
7699         }
7700
7701       if (stmts)
7702         {
7703           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7704           gcc_assert (!new_bb);
7705         }
7706     }
7707
7708
7709   /* Create the vector that holds the step of the induction.  */
7710   if (nested_in_vect_loop)
7711     /* iv_loop is nested in the loop to be vectorized. Generate:
7712        vec_step = [S, S, S, S]  */
7713     new_name = step_expr;
7714   else
7715     {
7716       /* iv_loop is the loop to be vectorized. Generate:
7717           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7718       gimple_seq seq = NULL;
7719       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7720         {
7721           expr = build_int_cst (integer_type_node, vf);
7722           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7723         }
7724       else
7725         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7726       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7727                                expr, step_expr);
7728       if (seq)
7729         {
7730           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7731           gcc_assert (!new_bb);
7732         }
7733     }
7734
7735   t = unshare_expr (new_name);
7736   gcc_assert (CONSTANT_CLASS_P (new_name)
7737               || TREE_CODE (new_name) == SSA_NAME);
7738   new_vec = build_vector_from_val (vectype, t);
7739   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7740
7741
7742   /* Create the following def-use cycle:
7743      loop prolog:
7744          vec_init = ...
7745          vec_step = ...
7746      loop:
7747          vec_iv = PHI <vec_init, vec_loop>
7748          ...
7749          STMT
7750          ...
7751          vec_loop = vec_iv + vec_step;  */
7752
7753   /* Create the induction-phi that defines the induction-operand.  */
7754   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7755   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7756   set_vinfo_for_stmt (induction_phi,
7757                       new_stmt_vec_info (induction_phi, loop_vinfo));
7758   induc_def = PHI_RESULT (induction_phi);
7759
7760   /* Create the iv update inside the loop  */
7761   vec_def = make_ssa_name (vec_dest);
7762   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7763   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7764   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7765
7766   /* Set the arguments of the phi node:  */
7767   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7768   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7769                UNKNOWN_LOCATION);
7770
7771   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7772
7773   /* In case that vectorization factor (VF) is bigger than the number
7774      of elements that we can fit in a vectype (nunits), we have to generate
7775      more than one vector stmt - i.e - we need to "unroll" the
7776      vector stmt by a factor VF/nunits.  For more details see documentation
7777      in vectorizable_operation.  */
7778
7779   if (ncopies > 1)
7780     {
7781       gimple_seq seq = NULL;
7782       stmt_vec_info prev_stmt_vinfo;
7783       /* FORNOW. This restriction should be relaxed.  */
7784       gcc_assert (!nested_in_vect_loop);
7785
7786       /* Create the vector that holds the step of the induction.  */
7787       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7788         {
7789           expr = build_int_cst (integer_type_node, nunits);
7790           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7791         }
7792       else
7793         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7794       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7795                                expr, step_expr);
7796       if (seq)
7797         {
7798           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7799           gcc_assert (!new_bb);
7800         }
7801
7802       t = unshare_expr (new_name);
7803       gcc_assert (CONSTANT_CLASS_P (new_name)
7804                   || TREE_CODE (new_name) == SSA_NAME);
7805       new_vec = build_vector_from_val (vectype, t);
7806       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7807
7808       vec_def = induc_def;
7809       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7810       for (i = 1; i < ncopies; i++)
7811         {
7812           /* vec_i = vec_prev + vec_step  */
7813           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7814                                           vec_def, vec_step);
7815           vec_def = make_ssa_name (vec_dest, new_stmt);
7816           gimple_assign_set_lhs (new_stmt, vec_def);
7817
7818           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7819           set_vinfo_for_stmt (new_stmt,
7820                               new_stmt_vec_info (new_stmt, loop_vinfo));
7821           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7822           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7823         }
7824     }
7825
7826   if (nested_in_vect_loop)
7827     {
7828       /* Find the loop-closed exit-phi of the induction, and record
7829          the final vector of induction results:  */
7830       exit_phi = NULL;
7831       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7832         {
7833           gimple *use_stmt = USE_STMT (use_p);
7834           if (is_gimple_debug (use_stmt))
7835             continue;
7836
7837           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7838             {
7839               exit_phi = use_stmt;
7840               break;
7841             }
7842         }
7843       if (exit_phi)
7844         {
7845           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7846           /* FORNOW. Currently not supporting the case that an inner-loop induction
7847              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7848           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7849                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7850
7851           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7852           if (dump_enabled_p ())
7853             {
7854               dump_printf_loc (MSG_NOTE, vect_location,
7855                                "vector of inductions after inner-loop:");
7856               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7857             }
7858         }
7859     }
7860
7861
7862   if (dump_enabled_p ())
7863     {
7864       dump_printf_loc (MSG_NOTE, vect_location,
7865                        "transform induction: created def-use cycle: ");
7866       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7867       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7868                         SSA_NAME_DEF_STMT (vec_def), 0);
7869     }
7870
7871   return true;
7872 }
7873
7874 /* Function vectorizable_live_operation.
7875
7876    STMT computes a value that is used outside the loop.  Check if
7877    it can be supported.  */
7878
7879 bool
7880 vectorizable_live_operation (gimple *stmt,
7881                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7882                              slp_tree slp_node, int slp_index,
7883                              gimple **vec_stmt,
7884                              stmt_vector_for_cost *)
7885 {
7886   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7887   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7888   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7889   imm_use_iterator imm_iter;
7890   tree lhs, lhs_type, bitsize, vec_bitsize;
7891   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7892   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7893   int ncopies;
7894   gimple *use_stmt;
7895   auto_vec<tree> vec_oprnds;
7896   int vec_entry = 0;
7897   poly_uint64 vec_index = 0;
7898
7899   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7900
7901   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7902     return false;
7903
7904   /* FORNOW.  CHECKME.  */
7905   if (nested_in_vect_loop_p (loop, stmt))
7906     return false;
7907
7908   /* If STMT is not relevant and it is a simple assignment and its inputs are
7909      invariant then it can remain in place, unvectorized.  The original last
7910      scalar value that it computes will be used.  */
7911   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7912     {
7913       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7914       if (dump_enabled_p ())
7915         dump_printf_loc (MSG_NOTE, vect_location,
7916                          "statement is simple and uses invariant.  Leaving in "
7917                          "place.\n");
7918       return true;
7919     }
7920
7921   if (slp_node)
7922     ncopies = 1;
7923   else
7924     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7925
7926   if (slp_node)
7927     {
7928       gcc_assert (slp_index >= 0);
7929
7930       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7931       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7932
7933       /* Get the last occurrence of the scalar index from the concatenation of
7934          all the slp vectors. Calculate which slp vector it is and the index
7935          within.  */
7936       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7937
7938       /* Calculate which vector contains the result, and which lane of
7939          that vector we need.  */
7940       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7941         {
7942           if (dump_enabled_p ())
7943             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7944                              "Cannot determine which vector holds the"
7945                              " final result.\n");
7946           return false;
7947         }
7948     }
7949
7950   if (!vec_stmt)
7951     {
7952       /* No transformation required.  */
7953       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7954         {
7955           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7956                                                OPTIMIZE_FOR_SPEED))
7957             {
7958               if (dump_enabled_p ())
7959                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7960                                  "can't use a fully-masked loop because "
7961                                  "the target doesn't support extract last "
7962                                  "reduction.\n");
7963               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7964             }
7965           else if (slp_node)
7966             {
7967               if (dump_enabled_p ())
7968                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7969                                  "can't use a fully-masked loop because an "
7970                                  "SLP statement is live after the loop.\n");
7971               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7972             }
7973           else if (ncopies > 1)
7974             {
7975               if (dump_enabled_p ())
7976                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977                                  "can't use a fully-masked loop because"
7978                                  " ncopies is greater than 1.\n");
7979               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7980             }
7981           else
7982             {
7983               gcc_assert (ncopies == 1 && !slp_node);
7984               vect_record_loop_mask (loop_vinfo,
7985                                      &LOOP_VINFO_MASKS (loop_vinfo),
7986                                      1, vectype);
7987             }
7988         }
7989       return true;
7990     }
7991
7992   /* If stmt has a related stmt, then use that for getting the lhs.  */
7993   if (is_pattern_stmt_p (stmt_info))
7994     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7995
7996   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7997         : gimple_get_lhs (stmt);
7998   lhs_type = TREE_TYPE (lhs);
7999
8000   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8001              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8002              : TYPE_SIZE (TREE_TYPE (vectype)));
8003   vec_bitsize = TYPE_SIZE (vectype);
8004
8005   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8006   tree vec_lhs, bitstart;
8007   if (slp_node)
8008     {
8009       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8010
8011       /* Get the correct slp vectorized stmt.  */
8012       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8013       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8014         vec_lhs = gimple_phi_result (phi);
8015       else
8016         vec_lhs = gimple_get_lhs (vec_stmt);
8017
8018       /* Get entry to use.  */
8019       bitstart = bitsize_int (vec_index);
8020       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8021     }
8022   else
8023     {
8024       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8025       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8026       gcc_checking_assert (ncopies == 1
8027                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8028
8029       /* For multiple copies, get the last copy.  */
8030       for (int i = 1; i < ncopies; ++i)
8031         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8032                                                   vec_lhs);
8033
8034       /* Get the last lane in the vector.  */
8035       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8036     }
8037
8038   gimple_seq stmts = NULL;
8039   tree new_tree;
8040   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8041     {
8042       /* Emit:
8043
8044            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8045
8046          where VEC_LHS is the vectorized live-out result and MASK is
8047          the loop mask for the final iteration.  */
8048       gcc_assert (ncopies == 1 && !slp_node);
8049       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8050       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8051                                       1, vectype, 0);
8052       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8053                                       scalar_type, mask, vec_lhs);
8054
8055       /* Convert the extracted vector element to the required scalar type.  */
8056       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8057     }
8058   else
8059     {
8060       tree bftype = TREE_TYPE (vectype);
8061       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8062         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8063       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8064       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8065                                        &stmts, true, NULL_TREE);
8066     }
8067
8068   if (stmts)
8069     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8070
8071   /* Replace use of lhs with newly computed result.  If the use stmt is a
8072      single arg PHI, just replace all uses of PHI result.  It's necessary
8073      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8074   use_operand_p use_p;
8075   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8076     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8077         && !is_gimple_debug (use_stmt))
8078     {
8079       if (gimple_code (use_stmt) == GIMPLE_PHI
8080           && gimple_phi_num_args (use_stmt) == 1)
8081         {
8082           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8083         }
8084       else
8085         {
8086           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8087             SET_USE (use_p, new_tree);
8088         }
8089       update_stmt (use_stmt);
8090     }
8091
8092   return true;
8093 }
8094
8095 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8096
8097 static void
8098 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8099 {
8100   ssa_op_iter op_iter;
8101   imm_use_iterator imm_iter;
8102   def_operand_p def_p;
8103   gimple *ustmt;
8104
8105   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8106     {
8107       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8108         {
8109           basic_block bb;
8110
8111           if (!is_gimple_debug (ustmt))
8112             continue;
8113
8114           bb = gimple_bb (ustmt);
8115
8116           if (!flow_bb_inside_loop_p (loop, bb))
8117             {
8118               if (gimple_debug_bind_p (ustmt))
8119                 {
8120                   if (dump_enabled_p ())
8121                     dump_printf_loc (MSG_NOTE, vect_location,
8122                                      "killing debug use\n");
8123
8124                   gimple_debug_bind_reset_value (ustmt);
8125                   update_stmt (ustmt);
8126                 }
8127               else
8128                 gcc_unreachable ();
8129             }
8130         }
8131     }
8132 }
8133
8134 /* Given loop represented by LOOP_VINFO, return true if computation of
8135    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8136    otherwise.  */
8137
8138 static bool
8139 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8140 {
8141   /* Constant case.  */
8142   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8143     {
8144       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8145       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8146
8147       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8148       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8149       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8150         return true;
8151     }
8152
8153   widest_int max;
8154   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8155   /* Check the upper bound of loop niters.  */
8156   if (get_max_loop_iterations (loop, &max))
8157     {
8158       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8159       signop sgn = TYPE_SIGN (type);
8160       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8161       if (max < type_max)
8162         return true;
8163     }
8164   return false;
8165 }
8166
8167 /* Return a mask type with half the number of elements as TYPE.  */
8168
8169 tree
8170 vect_halve_mask_nunits (tree type)
8171 {
8172   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8173   return build_truth_vector_type (nunits, current_vector_size);
8174 }
8175
8176 /* Return a mask type with twice as many elements as TYPE.  */
8177
8178 tree
8179 vect_double_mask_nunits (tree type)
8180 {
8181   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8182   return build_truth_vector_type (nunits, current_vector_size);
8183 }
8184
8185 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8186    contain a sequence of NVECTORS masks that each control a vector of type
8187    VECTYPE.  */
8188
8189 void
8190 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8191                        unsigned int nvectors, tree vectype)
8192 {
8193   gcc_assert (nvectors != 0);
8194   if (masks->length () < nvectors)
8195     masks->safe_grow_cleared (nvectors);
8196   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8197   /* The number of scalars per iteration and the number of vectors are
8198      both compile-time constants.  */
8199   unsigned int nscalars_per_iter
8200     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8201                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8202   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8203     {
8204       rgm->max_nscalars_per_iter = nscalars_per_iter;
8205       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8206     }
8207 }
8208
8209 /* Given a complete set of masks MASKS, extract mask number INDEX
8210    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8211    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8212
8213    See the comment above vec_loop_masks for more details about the mask
8214    arrangement.  */
8215
8216 tree
8217 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8218                     unsigned int nvectors, tree vectype, unsigned int index)
8219 {
8220   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8221   tree mask_type = rgm->mask_type;
8222
8223   /* Populate the rgroup's mask array, if this is the first time we've
8224      used it.  */
8225   if (rgm->masks.is_empty ())
8226     {
8227       rgm->masks.safe_grow_cleared (nvectors);
8228       for (unsigned int i = 0; i < nvectors; ++i)
8229         {
8230           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8231           /* Provide a dummy definition until the real one is available.  */
8232           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8233           rgm->masks[i] = mask;
8234         }
8235     }
8236
8237   tree mask = rgm->masks[index];
8238   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8239                 TYPE_VECTOR_SUBPARTS (vectype)))
8240     {
8241       /* A loop mask for data type X can be reused for data type Y
8242          if X has N times more elements than Y and if Y's elements
8243          are N times bigger than X's.  In this case each sequence
8244          of N elements in the loop mask will be all-zero or all-one.
8245          We can then view-convert the mask so that each sequence of
8246          N elements is replaced by a single element.  */
8247       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8248                               TYPE_VECTOR_SUBPARTS (vectype)));
8249       gimple_seq seq = NULL;
8250       mask_type = build_same_sized_truth_vector_type (vectype);
8251       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8252       if (seq)
8253         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8254     }
8255   return mask;
8256 }
8257
8258 /* Scale profiling counters by estimation for LOOP which is vectorized
8259    by factor VF.  */
8260
8261 static void
8262 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8263 {
8264   edge preheader = loop_preheader_edge (loop);
8265   /* Reduce loop iterations by the vectorization factor.  */
8266   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8267   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8268
8269   if (freq_h.nonzero_p ())
8270     {
8271       profile_probability p;
8272
8273       /* Avoid dropping loop body profile counter to 0 because of zero count
8274          in loop's preheader.  */
8275       if (!(freq_e == profile_count::zero ()))
8276         freq_e = freq_e.force_nonzero ();
8277       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8278       scale_loop_frequencies (loop, p);
8279     }
8280
8281   edge exit_e = single_exit (loop);
8282   exit_e->probability = profile_probability::always ()
8283                                  .apply_scale (1, new_est_niter + 1);
8284
8285   edge exit_l = single_pred_edge (loop->latch);
8286   profile_probability prob = exit_l->probability;
8287   exit_l->probability = exit_e->probability.invert ();
8288   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8289     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8290 }
8291
8292 /* Function vect_transform_loop.
8293
8294    The analysis phase has determined that the loop is vectorizable.
8295    Vectorize the loop - created vectorized stmts to replace the scalar
8296    stmts in the loop, and update the loop exit condition.
8297    Returns scalar epilogue loop if any.  */
8298
8299 struct loop *
8300 vect_transform_loop (loop_vec_info loop_vinfo)
8301 {
8302   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8303   struct loop *epilogue = NULL;
8304   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8305   int nbbs = loop->num_nodes;
8306   int i;
8307   tree niters_vector = NULL_TREE;
8308   tree step_vector = NULL_TREE;
8309   tree niters_vector_mult_vf = NULL_TREE;
8310   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8311   unsigned int lowest_vf = constant_lower_bound (vf);
8312   bool grouped_store;
8313   bool slp_scheduled = false;
8314   gimple *stmt, *pattern_stmt;
8315   gimple_seq pattern_def_seq = NULL;
8316   gimple_stmt_iterator pattern_def_si = gsi_none ();
8317   bool transform_pattern_stmt = false;
8318   bool check_profitability = false;
8319   unsigned int th;
8320
8321   if (dump_enabled_p ())
8322     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8323
8324   /* Use the more conservative vectorization threshold.  If the number
8325      of iterations is constant assume the cost check has been performed
8326      by our caller.  If the threshold makes all loops profitable that
8327      run at least the (estimated) vectorization factor number of times
8328      checking is pointless, too.  */
8329   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8330   if (th >= vect_vf_for_cost (loop_vinfo)
8331       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8332     {
8333       if (dump_enabled_p ())
8334         dump_printf_loc (MSG_NOTE, vect_location,
8335                          "Profitability threshold is %d loop iterations.\n",
8336                          th);
8337       check_profitability = true;
8338     }
8339
8340   /* Make sure there exists a single-predecessor exit bb.  Do this before
8341      versioning.   */
8342   edge e = single_exit (loop);
8343   if (! single_pred_p (e->dest))
8344     {
8345       split_loop_exit_edge (e);
8346       if (dump_enabled_p ())
8347         dump_printf (MSG_NOTE, "split exit edge\n");
8348     }
8349
8350   /* Version the loop first, if required, so the profitability check
8351      comes first.  */
8352
8353   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8354     {
8355       poly_uint64 versioning_threshold
8356         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8357       if (check_profitability
8358           && ordered_p (poly_uint64 (th), versioning_threshold))
8359         {
8360           versioning_threshold = ordered_max (poly_uint64 (th),
8361                                               versioning_threshold);
8362           check_profitability = false;
8363         }
8364       vect_loop_versioning (loop_vinfo, th, check_profitability,
8365                             versioning_threshold);
8366       check_profitability = false;
8367     }
8368
8369   /* Make sure there exists a single-predecessor exit bb also on the
8370      scalar loop copy.  Do this after versioning but before peeling
8371      so CFG structure is fine for both scalar and if-converted loop
8372      to make slpeel_duplicate_current_defs_from_edges face matched
8373      loop closed PHI nodes on the exit.  */
8374   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8375     {
8376       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8377       if (! single_pred_p (e->dest))
8378         {
8379           split_loop_exit_edge (e);
8380           if (dump_enabled_p ())
8381             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8382         }
8383     }
8384
8385   tree niters = vect_build_loop_niters (loop_vinfo);
8386   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8387   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8388   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8389   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8390                               &step_vector, &niters_vector_mult_vf, th,
8391                               check_profitability, niters_no_overflow);
8392
8393   if (niters_vector == NULL_TREE)
8394     {
8395       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8396           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8397           && known_eq (lowest_vf, vf))
8398         {
8399           niters_vector
8400             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8401                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8402           step_vector = build_one_cst (TREE_TYPE (niters));
8403         }
8404       else
8405         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8406                                      &step_vector, niters_no_overflow);
8407     }
8408
8409   /* 1) Make sure the loop header has exactly two entries
8410      2) Make sure we have a preheader basic block.  */
8411
8412   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8413
8414   split_edge (loop_preheader_edge (loop));
8415
8416   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8417       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8418     /* This will deal with any possible peeling.  */
8419     vect_prepare_for_masked_peels (loop_vinfo);
8420
8421   /* FORNOW: the vectorizer supports only loops which body consist
8422      of one basic block (header + empty latch). When the vectorizer will
8423      support more involved loop forms, the order by which the BBs are
8424      traversed need to be reconsidered.  */
8425
8426   for (i = 0; i < nbbs; i++)
8427     {
8428       basic_block bb = bbs[i];
8429       stmt_vec_info stmt_info;
8430
8431       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8432            gsi_next (&si))
8433         {
8434           gphi *phi = si.phi ();
8435           if (dump_enabled_p ())
8436             {
8437               dump_printf_loc (MSG_NOTE, vect_location,
8438                                "------>vectorizing phi: ");
8439               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8440             }
8441           stmt_info = vinfo_for_stmt (phi);
8442           if (!stmt_info)
8443             continue;
8444
8445           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8446             vect_loop_kill_debug_uses (loop, phi);
8447
8448           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8449               && !STMT_VINFO_LIVE_P (stmt_info))
8450             continue;
8451
8452           if (STMT_VINFO_VECTYPE (stmt_info)
8453               && (maybe_ne
8454                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8455               && dump_enabled_p ())
8456             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8457
8458           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8459                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8460                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8461               && ! PURE_SLP_STMT (stmt_info))
8462             {
8463               if (dump_enabled_p ())
8464                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8465               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8466             }
8467         }
8468
8469       pattern_stmt = NULL;
8470       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8471            !gsi_end_p (si) || transform_pattern_stmt;)
8472         {
8473           bool is_store;
8474
8475           if (transform_pattern_stmt)
8476             stmt = pattern_stmt;
8477           else
8478             {
8479               stmt = gsi_stmt (si);
8480               /* During vectorization remove existing clobber stmts.  */
8481               if (gimple_clobber_p (stmt))
8482                 {
8483                   unlink_stmt_vdef (stmt);
8484                   gsi_remove (&si, true);
8485                   release_defs (stmt);
8486                   continue;
8487                 }
8488             }
8489
8490           if (dump_enabled_p ())
8491             {
8492               dump_printf_loc (MSG_NOTE, vect_location,
8493                                "------>vectorizing statement: ");
8494               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8495             }
8496
8497           stmt_info = vinfo_for_stmt (stmt);
8498
8499           /* vector stmts created in the outer-loop during vectorization of
8500              stmts in an inner-loop may not have a stmt_info, and do not
8501              need to be vectorized.  */
8502           if (!stmt_info)
8503             {
8504               gsi_next (&si);
8505               continue;
8506             }
8507
8508           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8509             vect_loop_kill_debug_uses (loop, stmt);
8510
8511           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8512               && !STMT_VINFO_LIVE_P (stmt_info))
8513             {
8514               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8515                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8516                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8517                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8518                 {
8519                   stmt = pattern_stmt;
8520                   stmt_info = vinfo_for_stmt (stmt);
8521                 }
8522               else
8523                 {
8524                   gsi_next (&si);
8525                   continue;
8526                 }
8527             }
8528           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8529                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8530                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8531                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8532             transform_pattern_stmt = true;
8533
8534           /* If pattern statement has def stmts, vectorize them too.  */
8535           if (is_pattern_stmt_p (stmt_info))
8536             {
8537               if (pattern_def_seq == NULL)
8538                 {
8539                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8540                   pattern_def_si = gsi_start (pattern_def_seq);
8541                 }
8542               else if (!gsi_end_p (pattern_def_si))
8543                 gsi_next (&pattern_def_si);
8544               if (pattern_def_seq != NULL)
8545                 {
8546                   gimple *pattern_def_stmt = NULL;
8547                   stmt_vec_info pattern_def_stmt_info = NULL;
8548
8549                   while (!gsi_end_p (pattern_def_si))
8550                     {
8551                       pattern_def_stmt = gsi_stmt (pattern_def_si);
8552                       pattern_def_stmt_info
8553                         = vinfo_for_stmt (pattern_def_stmt);
8554                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8555                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8556                         break;
8557                       gsi_next (&pattern_def_si);
8558                     }
8559
8560                   if (!gsi_end_p (pattern_def_si))
8561                     {
8562                       if (dump_enabled_p ())
8563                         {
8564                           dump_printf_loc (MSG_NOTE, vect_location,
8565                                            "==> vectorizing pattern def "
8566                                            "stmt: ");
8567                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8568                                             pattern_def_stmt, 0);
8569                         }
8570
8571                       stmt = pattern_def_stmt;
8572                       stmt_info = pattern_def_stmt_info;
8573                     }
8574                   else
8575                     {
8576                       pattern_def_si = gsi_none ();
8577                       transform_pattern_stmt = false;
8578                     }
8579                 }
8580               else
8581                 transform_pattern_stmt = false;
8582             }
8583
8584           if (STMT_VINFO_VECTYPE (stmt_info))
8585             {
8586               poly_uint64 nunits
8587                 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8588               if (!STMT_SLP_TYPE (stmt_info)
8589                   && maybe_ne (nunits, vf)
8590                   && dump_enabled_p ())
8591                   /* For SLP VF is set according to unrolling factor, and not
8592                      to vector size, hence for SLP this print is not valid.  */
8593                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8594             }
8595
8596           /* SLP. Schedule all the SLP instances when the first SLP stmt is
8597              reached.  */
8598           if (STMT_SLP_TYPE (stmt_info))
8599             {
8600               if (!slp_scheduled)
8601                 {
8602                   slp_scheduled = true;
8603
8604                   if (dump_enabled_p ())
8605                     dump_printf_loc (MSG_NOTE, vect_location,
8606                                      "=== scheduling SLP instances ===\n");
8607
8608                   vect_schedule_slp (loop_vinfo);
8609                 }
8610
8611               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8612               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8613                 {
8614                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8615                     {
8616                       pattern_def_seq = NULL;
8617                       gsi_next (&si);
8618                     }
8619                   continue;
8620                 }
8621             }
8622
8623           /* -------- vectorize statement ------------ */
8624           if (dump_enabled_p ())
8625             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8626
8627           grouped_store = false;
8628           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8629           if (is_store)
8630             {
8631               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8632                 {
8633                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8634                      interleaving chain was completed - free all the stores in
8635                      the chain.  */
8636                   gsi_next (&si);
8637                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8638                 }
8639               else
8640                 {
8641                   /* Free the attached stmt_vec_info and remove the stmt.  */
8642                   gimple *store = gsi_stmt (si);
8643                   free_stmt_vec_info (store);
8644                   unlink_stmt_vdef (store);
8645                   gsi_remove (&si, true);
8646                   release_defs (store);
8647                 }
8648
8649               /* Stores can only appear at the end of pattern statements.  */
8650               gcc_assert (!transform_pattern_stmt);
8651               pattern_def_seq = NULL;
8652             }
8653           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8654             {
8655               pattern_def_seq = NULL;
8656               gsi_next (&si);
8657             }
8658         }                       /* stmts in BB */
8659
8660       /* Stub out scalar statements that must not survive vectorization.
8661          Doing this here helps with grouped statements, or statements that
8662          are involved in patterns.  */
8663       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8664            !gsi_end_p (gsi); gsi_next (&gsi))
8665         {
8666           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8667           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8668             {
8669               tree lhs = gimple_get_lhs (call);
8670               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8671                 {
8672                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8673                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8674                   gsi_replace (&gsi, new_stmt, true);
8675                 }
8676             }
8677         }
8678     }                           /* BBs in loop */
8679
8680   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8681      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8682   if (integer_onep (step_vector))
8683     niters_no_overflow = true;
8684   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8685                            niters_vector_mult_vf, !niters_no_overflow);
8686
8687   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8688   scale_profile_for_vect_loop (loop, assumed_vf);
8689
8690   /* True if the final iteration might not handle a full vector's
8691      worth of scalar iterations.  */
8692   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8693   /* The minimum number of iterations performed by the epilogue.  This
8694      is 1 when peeling for gaps because we always need a final scalar
8695      iteration.  */
8696   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8697   /* +1 to convert latch counts to loop iteration counts,
8698      -min_epilogue_iters to remove iterations that cannot be performed
8699        by the vector code.  */
8700   int bias_for_lowest = 1 - min_epilogue_iters;
8701   int bias_for_assumed = bias_for_lowest;
8702   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8703   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8704     {
8705       /* When the amount of peeling is known at compile time, the first
8706          iteration will have exactly alignment_npeels active elements.
8707          In the worst case it will have at least one.  */
8708       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8709       bias_for_lowest += lowest_vf - min_first_active;
8710       bias_for_assumed += assumed_vf - min_first_active;
8711     }
8712   /* In these calculations the "- 1" converts loop iteration counts
8713      back to latch counts.  */
8714   if (loop->any_upper_bound)
8715     loop->nb_iterations_upper_bound
8716       = (final_iter_may_be_partial
8717          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8718                           lowest_vf) - 1
8719          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8720                            lowest_vf) - 1);
8721   if (loop->any_likely_upper_bound)
8722     loop->nb_iterations_likely_upper_bound
8723       = (final_iter_may_be_partial
8724          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8725                           + bias_for_lowest, lowest_vf) - 1
8726          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8727                            + bias_for_lowest, lowest_vf) - 1);
8728   if (loop->any_estimate)
8729     loop->nb_iterations_estimate
8730       = (final_iter_may_be_partial
8731          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8732                           assumed_vf) - 1
8733          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8734                            assumed_vf) - 1);
8735
8736   if (dump_enabled_p ())
8737     {
8738       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8739         {
8740           dump_printf_loc (MSG_NOTE, vect_location,
8741                            "LOOP VECTORIZED\n");
8742           if (loop->inner)
8743             dump_printf_loc (MSG_NOTE, vect_location,
8744                              "OUTER LOOP VECTORIZED\n");
8745           dump_printf (MSG_NOTE, "\n");
8746         }
8747       else
8748         {
8749           dump_printf_loc (MSG_NOTE, vect_location,
8750                            "LOOP EPILOGUE VECTORIZED (VS=");
8751           dump_dec (MSG_NOTE, current_vector_size);
8752           dump_printf (MSG_NOTE, ")\n");
8753         }
8754     }
8755
8756   /* Free SLP instances here because otherwise stmt reference counting
8757      won't work.  */
8758   slp_instance instance;
8759   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8760     vect_free_slp_instance (instance);
8761   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8762   /* Clear-up safelen field since its value is invalid after vectorization
8763      since vectorized loop can have loop-carried dependencies.  */
8764   loop->safelen = 0;
8765
8766   /* Don't vectorize epilogue for epilogue.  */
8767   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8768     epilogue = NULL;
8769
8770   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8771     epilogue = NULL;
8772
8773   if (epilogue)
8774     {
8775       auto_vector_sizes vector_sizes;
8776       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8777       unsigned int next_size = 0;
8778
8779       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8780           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8781           && known_eq (vf, lowest_vf))
8782         {
8783           unsigned int eiters
8784             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8785                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8786           eiters = eiters % lowest_vf;
8787           epilogue->nb_iterations_upper_bound = eiters - 1;
8788
8789           unsigned int ratio;
8790           while (next_size < vector_sizes.length ()
8791                  && !(constant_multiple_p (current_vector_size,
8792                                            vector_sizes[next_size], &ratio)
8793                       && eiters >= lowest_vf / ratio))
8794             next_size += 1;
8795         }
8796       else
8797         while (next_size < vector_sizes.length ()
8798                && maybe_lt (current_vector_size, vector_sizes[next_size]))
8799           next_size += 1;
8800
8801       if (next_size == vector_sizes.length ())
8802         epilogue = NULL;
8803     }
8804
8805   if (epilogue)
8806     {
8807       epilogue->force_vectorize = loop->force_vectorize;
8808       epilogue->safelen = loop->safelen;
8809       epilogue->dont_vectorize = false;
8810
8811       /* We may need to if-convert epilogue to vectorize it.  */
8812       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8813         tree_if_conversion (epilogue);
8814     }
8815
8816   return epilogue;
8817 }
8818
8819 /* The code below is trying to perform simple optimization - revert
8820    if-conversion for masked stores, i.e. if the mask of a store is zero
8821    do not perform it and all stored value producers also if possible.
8822    For example,
8823      for (i=0; i<n; i++)
8824        if (c[i])
8825         {
8826           p1[i] += 1;
8827           p2[i] = p3[i] +2;
8828         }
8829    this transformation will produce the following semi-hammock:
8830
8831    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8832      {
8833        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8834        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8835        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8836        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8837        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8838        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8839      }
8840 */
8841
8842 void
8843 optimize_mask_stores (struct loop *loop)
8844 {
8845   basic_block *bbs = get_loop_body (loop);
8846   unsigned nbbs = loop->num_nodes;
8847   unsigned i;
8848   basic_block bb;
8849   struct loop *bb_loop;
8850   gimple_stmt_iterator gsi;
8851   gimple *stmt;
8852   auto_vec<gimple *> worklist;
8853
8854   vect_location = find_loop_location (loop);
8855   /* Pick up all masked stores in loop if any.  */
8856   for (i = 0; i < nbbs; i++)
8857     {
8858       bb = bbs[i];
8859       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8860            gsi_next (&gsi))
8861         {
8862           stmt = gsi_stmt (gsi);
8863           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8864             worklist.safe_push (stmt);
8865         }
8866     }
8867
8868   free (bbs);
8869   if (worklist.is_empty ())
8870     return;
8871
8872   /* Loop has masked stores.  */
8873   while (!worklist.is_empty ())
8874     {
8875       gimple *last, *last_store;
8876       edge e, efalse;
8877       tree mask;
8878       basic_block store_bb, join_bb;
8879       gimple_stmt_iterator gsi_to;
8880       tree vdef, new_vdef;
8881       gphi *phi;
8882       tree vectype;
8883       tree zero;
8884
8885       last = worklist.pop ();
8886       mask = gimple_call_arg (last, 2);
8887       bb = gimple_bb (last);
8888       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8889          the same loop as if_bb.  It could be different to LOOP when two
8890          level loop-nest is vectorized and mask_store belongs to the inner
8891          one.  */
8892       e = split_block (bb, last);
8893       bb_loop = bb->loop_father;
8894       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8895       join_bb = e->dest;
8896       store_bb = create_empty_bb (bb);
8897       add_bb_to_loop (store_bb, bb_loop);
8898       e->flags = EDGE_TRUE_VALUE;
8899       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8900       /* Put STORE_BB to likely part.  */
8901       efalse->probability = profile_probability::unlikely ();
8902       store_bb->count = efalse->count ();
8903       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8904       if (dom_info_available_p (CDI_DOMINATORS))
8905         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8906       if (dump_enabled_p ())
8907         dump_printf_loc (MSG_NOTE, vect_location,
8908                          "Create new block %d to sink mask stores.",
8909                          store_bb->index);
8910       /* Create vector comparison with boolean result.  */
8911       vectype = TREE_TYPE (mask);
8912       zero = build_zero_cst (vectype);
8913       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8914       gsi = gsi_last_bb (bb);
8915       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8916       /* Create new PHI node for vdef of the last masked store:
8917          .MEM_2 = VDEF <.MEM_1>
8918          will be converted to
8919          .MEM.3 = VDEF <.MEM_1>
8920          and new PHI node will be created in join bb
8921          .MEM_2 = PHI <.MEM_1, .MEM_3>
8922       */
8923       vdef = gimple_vdef (last);
8924       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8925       gimple_set_vdef (last, new_vdef);
8926       phi = create_phi_node (vdef, join_bb);
8927       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8928
8929       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8930       while (true)
8931         {
8932           gimple_stmt_iterator gsi_from;
8933           gimple *stmt1 = NULL;
8934
8935           /* Move masked store to STORE_BB.  */
8936           last_store = last;
8937           gsi = gsi_for_stmt (last);
8938           gsi_from = gsi;
8939           /* Shift GSI to the previous stmt for further traversal.  */
8940           gsi_prev (&gsi);
8941           gsi_to = gsi_start_bb (store_bb);
8942           gsi_move_before (&gsi_from, &gsi_to);
8943           /* Setup GSI_TO to the non-empty block start.  */
8944           gsi_to = gsi_start_bb (store_bb);
8945           if (dump_enabled_p ())
8946             {
8947               dump_printf_loc (MSG_NOTE, vect_location,
8948                                "Move stmt to created bb\n");
8949               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8950             }
8951           /* Move all stored value producers if possible.  */
8952           while (!gsi_end_p (gsi))
8953             {
8954               tree lhs;
8955               imm_use_iterator imm_iter;
8956               use_operand_p use_p;
8957               bool res;
8958
8959               /* Skip debug statements.  */
8960               if (is_gimple_debug (gsi_stmt (gsi)))
8961                 {
8962                   gsi_prev (&gsi);
8963                   continue;
8964                 }
8965               stmt1 = gsi_stmt (gsi);
8966               /* Do not consider statements writing to memory or having
8967                  volatile operand.  */
8968               if (gimple_vdef (stmt1)
8969                   || gimple_has_volatile_ops (stmt1))
8970                 break;
8971               gsi_from = gsi;
8972               gsi_prev (&gsi);
8973               lhs = gimple_get_lhs (stmt1);
8974               if (!lhs)
8975                 break;
8976
8977               /* LHS of vectorized stmt must be SSA_NAME.  */
8978               if (TREE_CODE (lhs) != SSA_NAME)
8979                 break;
8980
8981               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8982                 {
8983                   /* Remove dead scalar statement.  */
8984                   if (has_zero_uses (lhs))
8985                     {
8986                       gsi_remove (&gsi_from, true);
8987                       continue;
8988                     }
8989                 }
8990
8991               /* Check that LHS does not have uses outside of STORE_BB.  */
8992               res = true;
8993               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8994                 {
8995                   gimple *use_stmt;
8996                   use_stmt = USE_STMT (use_p);
8997                   if (is_gimple_debug (use_stmt))
8998                     continue;
8999                   if (gimple_bb (use_stmt) != store_bb)
9000                     {
9001                       res = false;
9002                       break;
9003                     }
9004                 }
9005               if (!res)
9006                 break;
9007
9008               if (gimple_vuse (stmt1)
9009                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
9010                 break;
9011
9012               /* Can move STMT1 to STORE_BB.  */
9013               if (dump_enabled_p ())
9014                 {
9015                   dump_printf_loc (MSG_NOTE, vect_location,
9016                                    "Move stmt to created bb\n");
9017                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9018                 }
9019               gsi_move_before (&gsi_from, &gsi_to);
9020               /* Shift GSI_TO for further insertion.  */
9021               gsi_prev (&gsi_to);
9022             }
9023           /* Put other masked stores with the same mask to STORE_BB.  */
9024           if (worklist.is_empty ()
9025               || gimple_call_arg (worklist.last (), 2) != mask
9026               || worklist.last () != stmt1)
9027             break;
9028           last = worklist.pop ();
9029         }
9030       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9031     }
9032 }