gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations which vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157
 158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 159    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 160    may already be set for general statements (not just data refs).  */
 161
 162 static bool
 163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
 164                               bool vectype_maybe_set_p,
 165                               poly_uint64 *vf,
 166                               vec<stmt_vec_info > *mask_producers)
 167 {
 168   gimple *stmt = stmt_info->stmt;
 169
 170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 171        && !STMT_VINFO_LIVE_P (stmt_info))
 172       || gimple_clobber_p (stmt))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 176       return true;
 177     }
 178
 179   tree stmt_vectype, nunits_vectype;
 180   if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
 181                                        &nunits_vectype))
 182     return false;
 183
 184   if (stmt_vectype)
 185     {
 186       if (STMT_VINFO_VECTYPE (stmt_info))
 187         /* The only case when a vectype had been already set is for stmts
 188            that contain a data ref, or for "pattern-stmts" (stmts generated
 189            by the vectorizer to represent/replace a certain idiom).  */
 190         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 191                      || vectype_maybe_set_p)
 192                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 193       else if (stmt_vectype == boolean_type_node)
 194         mask_producers->safe_push (stmt_info);
 195       else
 196         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 197     }
 198
 199   if (nunits_vectype)
 200     vect_update_max_nunits (vf, nunits_vectype);
 201
 202   return true;
 203 }
 204
 205 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 206    types of STMT_INFO and all attached pattern statements and update
 207    the vectorization factor VF accordingly.  If some of the statements
 208    produce a mask result whose vector type can only be calculated later,
 209    add them to MASK_PRODUCERS.  Return true on success or false if
 210    something prevented vectorization.  */
 211
 212 static bool
 213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
 214                             vec<stmt_vec_info > *mask_producers)
 215 {
 216   if (dump_enabled_p ())
 217     {
 218       dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
 219       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 220     }
 221   if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
 222     return false;
 223
 224   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 225       && STMT_VINFO_RELATED_STMT (stmt_info))
 226     {
 227       stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
 228
 229       /* If a pattern statement has def stmts, analyze them too.  */
 230       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 231       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 232            !gsi_end_p (si); gsi_next (&si))
 233         {
 234           stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
 235           if (dump_enabled_p ())
 236             {
 237               dump_printf_loc (MSG_NOTE, vect_location,
 238                                "==> examining pattern def stmt: ");
 239               dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 240                                 def_stmt_info->stmt, 0);
 241             }
 242           if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 243                                              vf, mask_producers))
 244             return false;
 245         }
 246
 247       if (dump_enabled_p ())
 248         {
 249           dump_printf_loc (MSG_NOTE, vect_location,
 250                            "==> examining pattern statement: ");
 251           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 252         }
 253       if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
 254         return false;
 255     }
 256
 257   return true;
 258 }
 259
 260 /* Function vect_determine_vectorization_factor
 261
 262    Determine the vectorization factor (VF).  VF is the number of data elements
 263    that are operated upon in parallel in a single iteration of the vectorized
 264    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 265    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 266    elements can fit in a single vector register.
 267
 268    We currently support vectorization of loops in which all types operated upon
 269    are of the same size.  Therefore this function currently sets VF according to
 270    the size of the types operated upon, and fails if there are multiple sizes
 271    in the loop.
 272
 273    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 274    original loop:
 275         for (i=0; i<N; i++){
 276           a[i] = b[i] + c[i];
 277         }
 278
 279    vectorized loop:
 280         for (i=0; i<N; i+=VF){
 281           a[i:VF] = b[i:VF] + c[i:VF];
 282         }
 283 */
 284
 285 static bool
 286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 287 {
 288   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 289   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 290   unsigned nbbs = loop->num_nodes;
 291   poly_uint64 vectorization_factor = 1;
 292   tree scalar_type = NULL_TREE;
 293   gphi *phi;
 294   tree vectype;
 295   stmt_vec_info stmt_info;
 296   unsigned i;
 297   auto_vec<stmt_vec_info> mask_producers;
 298
 299   if (dump_enabled_p ())
 300     dump_printf_loc (MSG_NOTE, vect_location,
 301                      "=== vect_determine_vectorization_factor ===\n");
 302
 303   for (i = 0; i < nbbs; i++)
 304     {
 305       basic_block bb = bbs[i];
 306
 307       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 308            gsi_next (&si))
 309         {
 310           phi = si.phi ();
 311           stmt_info = vinfo_for_stmt (phi);
 312           if (dump_enabled_p ())
 313             {
 314               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 315               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 316             }
 317
 318           gcc_assert (stmt_info);
 319
 320           if (STMT_VINFO_RELEVANT_P (stmt_info)
 321               || STMT_VINFO_LIVE_P (stmt_info))
 322             {
 323               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 324               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 325
 326               if (dump_enabled_p ())
 327                 {
 328                   dump_printf_loc (MSG_NOTE, vect_location,
 329                                    "get vectype for scalar type:  ");
 330                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 331                   dump_printf (MSG_NOTE, "\n");
 332                 }
 333
 334               vectype = get_vectype_for_scalar_type (scalar_type);
 335               if (!vectype)
 336                 {
 337                   if (dump_enabled_p ())
 338                     {
 339                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 340                                        "not vectorized: unsupported "
 341                                        "data-type ");
 342                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 343                                          scalar_type);
 344                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 345                     }
 346                   return false;
 347                 }
 348               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 349
 350               if (dump_enabled_p ())
 351                 {
 352                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 353                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 354                   dump_printf (MSG_NOTE, "\n");
 355                 }
 356
 357               if (dump_enabled_p ())
 358                 {
 359                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 360                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 361                   dump_printf (MSG_NOTE, "\n");
 362                 }
 363
 364               vect_update_max_nunits (&vectorization_factor, vectype);
 365             }
 366         }
 367
 368       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 369            gsi_next (&si))
 370         {
 371           stmt_info = vinfo_for_stmt (gsi_stmt (si));
 372           if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
 373                                            &mask_producers))
 374             return false;
 375         }
 376     }
 377
 378   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 379   if (dump_enabled_p ())
 380     {
 381       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 382       dump_dec (MSG_NOTE, vectorization_factor);
 383       dump_printf (MSG_NOTE, "\n");
 384     }
 385
 386   if (known_le (vectorization_factor, 1U))
 387     {
 388       if (dump_enabled_p ())
 389         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 390                          "not vectorized: unsupported data-type\n");
 391       return false;
 392     }
 393   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 394
 395   for (i = 0; i < mask_producers.length (); i++)
 396     {
 397       stmt_info = mask_producers[i];
 398       tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
 399       if (!mask_type)
 400         return false;
 401       STMT_VINFO_VECTYPE (stmt_info) = mask_type;
 402     }
 403
 404   return true;
 405 }
 406
 407
 408 /* Function vect_is_simple_iv_evolution.
 409
 410    FORNOW: A simple evolution of an induction variables in the loop is
 411    considered a polynomial evolution.  */
 412
 413 static bool
 414 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 415                              tree * step)
 416 {
 417   tree init_expr;
 418   tree step_expr;
 419   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 420   basic_block bb;
 421
 422   /* When there is no evolution in this loop, the evolution function
 423      is not "simple".  */
 424   if (evolution_part == NULL_TREE)
 425     return false;
 426
 427   /* When the evolution is a polynomial of degree >= 2
 428      the evolution function is not "simple".  */
 429   if (tree_is_chrec (evolution_part))
 430     return false;
 431
 432   step_expr = evolution_part;
 433   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 434
 435   if (dump_enabled_p ())
 436     {
 437       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 438       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 439       dump_printf (MSG_NOTE, ",  init: ");
 440       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 441       dump_printf (MSG_NOTE, "\n");
 442     }
 443
 444   *init = init_expr;
 445   *step = step_expr;
 446
 447   if (TREE_CODE (step_expr) != INTEGER_CST
 448       && (TREE_CODE (step_expr) != SSA_NAME
 449           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 450               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 451           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 452               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 453                   || !flag_associative_math)))
 454       && (TREE_CODE (step_expr) != REAL_CST
 455           || !flag_associative_math))
 456     {
 457       if (dump_enabled_p ())
 458         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 459                          "step unknown.\n");
 460       return false;
 461     }
 462
 463   return true;
 464 }
 465
 466 /* Function vect_analyze_scalar_cycles_1.
 467
 468    Examine the cross iteration def-use cycles of scalar variables
 469    in LOOP.  LOOP_VINFO represents the loop that is now being
 470    considered for vectorization (can be LOOP, or an outer-loop
 471    enclosing LOOP).  */
 472
 473 static void
 474 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 475 {
 476   basic_block bb = loop->header;
 477   tree init, step;
 478   auto_vec<gimple *, 64> worklist;
 479   gphi_iterator gsi;
 480   bool double_reduc;
 481
 482   if (dump_enabled_p ())
 483     dump_printf_loc (MSG_NOTE, vect_location,
 484                      "=== vect_analyze_scalar_cycles ===\n");
 485
 486   /* First - identify all inductions.  Reduction detection assumes that all the
 487      inductions have been identified, therefore, this order must not be
 488      changed.  */
 489   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 490     {
 491       gphi *phi = gsi.phi ();
 492       tree access_fn = NULL;
 493       tree def = PHI_RESULT (phi);
 494       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 495
 496       if (dump_enabled_p ())
 497         {
 498           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 499           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 500         }
 501
 502       /* Skip virtual phi's.  The data dependences that are associated with
 503          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 504       if (virtual_operand_p (def))
 505         continue;
 506
 507       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 508
 509       /* Analyze the evolution function.  */
 510       access_fn = analyze_scalar_evolution (loop, def);
 511       if (access_fn)
 512         {
 513           STRIP_NOPS (access_fn);
 514           if (dump_enabled_p ())
 515             {
 516               dump_printf_loc (MSG_NOTE, vect_location,
 517                                "Access function of PHI: ");
 518               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 519               dump_printf (MSG_NOTE, "\n");
 520             }
 521           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 522             = initial_condition_in_loop_num (access_fn, loop->num);
 523           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 524             = evolution_part_in_loop_num (access_fn, loop->num);
 525         }
 526
 527       if (!access_fn
 528           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 529           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 530               && TREE_CODE (step) != INTEGER_CST))
 531         {
 532           worklist.safe_push (phi);
 533           continue;
 534         }
 535
 536       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 537                   != NULL_TREE);
 538       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 539
 540       if (dump_enabled_p ())
 541         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 542       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 543     }
 544
 545
 546   /* Second - identify all reductions and nested cycles.  */
 547   while (worklist.length () > 0)
 548     {
 549       gimple *phi = worklist.pop ();
 550       tree def = PHI_RESULT (phi);
 551       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 552       gimple *reduc_stmt;
 553
 554       if (dump_enabled_p ())
 555         {
 556           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 557           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 558         }
 559
 560       gcc_assert (!virtual_operand_p (def)
 561                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 562
 563       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 564                                                 &double_reduc, false);
 565       if (reduc_stmt)
 566         {
 567           if (double_reduc)
 568             {
 569               if (dump_enabled_p ())
 570                 dump_printf_loc (MSG_NOTE, vect_location,
 571                                  "Detected double reduction.\n");
 572
 573               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 574               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 575                                                     vect_double_reduction_def;
 576             }
 577           else
 578             {
 579               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 580                 {
 581                   if (dump_enabled_p ())
 582                     dump_printf_loc (MSG_NOTE, vect_location,
 583                                      "Detected vectorizable nested cycle.\n");
 584
 585                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 586                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 587                                                              vect_nested_cycle;
 588                 }
 589               else
 590                 {
 591                   if (dump_enabled_p ())
 592                     dump_printf_loc (MSG_NOTE, vect_location,
 593                                      "Detected reduction.\n");
 594
 595                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 596                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 597                                                            vect_reduction_def;
 598                   /* Store the reduction cycles for possible vectorization in
 599                      loop-aware SLP if it was not detected as reduction
 600                      chain.  */
 601                   if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 602                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 603                 }
 604             }
 605         }
 606       else
 607         if (dump_enabled_p ())
 608           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 609                            "Unknown def-use cycle pattern.\n");
 610     }
 611 }
 612
 613
 614 /* Function vect_analyze_scalar_cycles.
 615
 616    Examine the cross iteration def-use cycles of scalar variables, by
 617    analyzing the loop-header PHIs of scalar variables.  Classify each
 618    cycle as one of the following: invariant, induction, reduction, unknown.
 619    We do that for the loop represented by LOOP_VINFO, and also to its
 620    inner-loop, if exists.
 621    Examples for scalar cycles:
 622
 623    Example1: reduction:
 624
 625               loop1:
 626               for (i=0; i<N; i++)
 627                  sum += a[i];
 628
 629    Example2: induction:
 630
 631               loop2:
 632               for (i=0; i<N; i++)
 633                  a[i] = i;  */
 634
 635 static void
 636 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 637 {
 638   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 639
 640   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 641
 642   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 643      Reductions in such inner-loop therefore have different properties than
 644      the reductions in the nest that gets vectorized:
 645      1. When vectorized, they are executed in the same order as in the original
 646         scalar loop, so we can't change the order of computation when
 647         vectorizing them.
 648      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 649         current checks are too strict.  */
 650
 651   if (loop->inner)
 652     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 653 }
 654
 655 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 656
 657 static void
 658 vect_fixup_reduc_chain (gimple *stmt)
 659 {
 660   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 661   gimple *stmtp;
 662   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 663               && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 664   REDUC_GROUP_SIZE (vinfo_for_stmt (firstp))
 665     = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
 666   do
 667     {
 668       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 669       REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 670       stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 671       if (stmt)
 672         REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 673           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 674     }
 675   while (stmt);
 676   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 677 }
 678
 679 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 680
 681 static void
 682 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 683 {
 684   gimple *first;
 685   unsigned i;
 686
 687   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 688     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 689       {
 690         gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 691         while (next)
 692           {
 693             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 694               break;
 695             next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 696           }
 697         /* If not all stmt in the chain are patterns try to handle
 698            the chain without patterns.  */
 699         if (! next)
 700           {
 701             vect_fixup_reduc_chain (first);
 702             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 703               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 704           }
 705       }
 706 }
 707
 708 /* Function vect_get_loop_niters.
 709
 710    Determine how many iterations the loop is executed and place it
 711    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 712    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 713    niter information holds in ASSUMPTIONS.
 714
 715    Return the loop exit condition.  */
 716
 717
 718 static gcond *
 719 vect_get_loop_niters (struct loop *loop, tree *assumptions,
 720                       tree *number_of_iterations, tree *number_of_iterationsm1)
 721 {
 722   edge exit = single_exit (loop);
 723   struct tree_niter_desc niter_desc;
 724   tree niter_assumptions, niter, may_be_zero;
 725   gcond *cond = get_loop_exit_condition (loop);
 726
 727   *assumptions = boolean_true_node;
 728   *number_of_iterationsm1 = chrec_dont_know;
 729   *number_of_iterations = chrec_dont_know;
 730   if (dump_enabled_p ())
 731     dump_printf_loc (MSG_NOTE, vect_location,
 732                      "=== get_loop_niters ===\n");
 733
 734   if (!exit)
 735     return cond;
 736
 737   niter = chrec_dont_know;
 738   may_be_zero = NULL_TREE;
 739   niter_assumptions = boolean_true_node;
 740   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 741       || chrec_contains_undetermined (niter_desc.niter))
 742     return cond;
 743
 744   niter_assumptions = niter_desc.assumptions;
 745   may_be_zero = niter_desc.may_be_zero;
 746   niter = niter_desc.niter;
 747
 748   if (may_be_zero && integer_zerop (may_be_zero))
 749     may_be_zero = NULL_TREE;
 750
 751   if (may_be_zero)
 752     {
 753       if (COMPARISON_CLASS_P (may_be_zero))
 754         {
 755           /* Try to combine may_be_zero with assumptions, this can simplify
 756              computation of niter expression.  */
 757           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 758             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 759                                              niter_assumptions,
 760                                              fold_build1 (TRUTH_NOT_EXPR,
 761                                                           boolean_type_node,
 762                                                           may_be_zero));
 763           else
 764             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 765                                  build_int_cst (TREE_TYPE (niter), 0),
 766                                  rewrite_to_non_trapping_overflow (niter));
 767
 768           may_be_zero = NULL_TREE;
 769         }
 770       else if (integer_nonzerop (may_be_zero))
 771         {
 772           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 773           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 774           return cond;
 775         }
 776       else
 777         return cond;
 778     }
 779
 780   *assumptions = niter_assumptions;
 781   *number_of_iterationsm1 = niter;
 782
 783   /* We want the number of loop header executions which is the number
 784      of latch executions plus one.
 785      ???  For UINT_MAX latch executions this number overflows to zero
 786      for loops like do { n++; } while (n != 0);  */
 787   if (niter && !chrec_contains_undetermined (niter))
 788     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 789                           build_int_cst (TREE_TYPE (niter), 1));
 790   *number_of_iterations = niter;
 791
 792   return cond;
 793 }
 794
 795 /* Function bb_in_loop_p
 796
 797    Used as predicate for dfs order traversal of the loop bbs.  */
 798
 799 static bool
 800 bb_in_loop_p (const_basic_block bb, const void *data)
 801 {
 802   const struct loop *const loop = (const struct loop *)data;
 803   if (flow_bb_inside_loop_p (loop, bb))
 804     return true;
 805   return false;
 806 }
 807
 808
 809 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 810    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 811
 812 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
 813   : vec_info (vec_info::loop, init_cost (loop_in)),
 814     loop (loop_in),
 815     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 816     num_itersm1 (NULL_TREE),
 817     num_iters (NULL_TREE),
 818     num_iters_unchanged (NULL_TREE),
 819     num_iters_assumptions (NULL_TREE),
 820     th (0),
 821     versioning_threshold (0),
 822     vectorization_factor (0),
 823     max_vectorization_factor (0),
 824     mask_skip_niters (NULL_TREE),
 825     mask_compare_type (NULL_TREE),
 826     unaligned_dr (NULL),
 827     peeling_for_alignment (0),
 828     ptr_mask (0),
 829     ivexpr_map (NULL),
 830     slp_unrolling_factor (1),
 831     single_scalar_iteration_cost (0),
 832     vectorizable (false),
 833     can_fully_mask_p (true),
 834     fully_masked_p (false),
 835     peeling_for_gaps (false),
 836     peeling_for_niter (false),
 837     operands_swapped (false),
 838     no_data_dependencies (false),
 839     has_mask_store (false),
 840     scalar_loop (NULL),
 841     orig_loop_info (NULL)
 842 {
 843   /* Create/Update stmt_info for all stmts in the loop.  */
 844   basic_block *body = get_loop_body (loop);
 845   for (unsigned int i = 0; i < loop->num_nodes; i++)
 846     {
 847       basic_block bb = body[i];
 848       gimple_stmt_iterator si;
 849
 850       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 851         {
 852           gimple *phi = gsi_stmt (si);
 853           gimple_set_uid (phi, 0);
 854           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
 855         }
 856
 857       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 858         {
 859           gimple *stmt = gsi_stmt (si);
 860           gimple_set_uid (stmt, 0);
 861           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
 862         }
 863     }
 864   free (body);
 865
 866   /* CHECKME: We want to visit all BBs before their successors (except for
 867      latch blocks, for which this assertion wouldn't hold).  In the simple
 868      case of the loop forms we allow, a dfs order of the BBs would the same
 869      as reversed postorder traversal, so we are safe.  */
 870
 871   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 872                                           bbs, loop->num_nodes, loop);
 873   gcc_assert (nbbs == loop->num_nodes);
 874 }
 875
 876 /* Free all levels of MASKS.  */
 877
 878 void
 879 release_vec_loop_masks (vec_loop_masks *masks)
 880 {
 881   rgroup_masks *rgm;
 882   unsigned int i;
 883   FOR_EACH_VEC_ELT (*masks, i, rgm)
 884     rgm->masks.release ();
 885   masks->release ();
 886 }
 887
 888 /* Free all memory used by the _loop_vec_info, as well as all the
 889    stmt_vec_info structs of all the stmts in the loop.  */
 890
 891 _loop_vec_info::~_loop_vec_info ()
 892 {
 893   int nbbs;
 894   gimple_stmt_iterator si;
 895   int j;
 896
 897   nbbs = loop->num_nodes;
 898   for (j = 0; j < nbbs; j++)
 899     {
 900       basic_block bb = bbs[j];
 901       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 902         free_stmt_vec_info (gsi_stmt (si));
 903
 904       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 905         {
 906           gimple *stmt = gsi_stmt (si);
 907
 908           /* We may have broken canonical form by moving a constant
 909              into RHS1 of a commutative op.  Fix such occurrences.  */
 910           if (operands_swapped && is_gimple_assign (stmt))
 911             {
 912               enum tree_code code = gimple_assign_rhs_code (stmt);
 913
 914               if ((code == PLUS_EXPR
 915                    || code == POINTER_PLUS_EXPR
 916                    || code == MULT_EXPR)
 917                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 918                 swap_ssa_operands (stmt,
 919                                    gimple_assign_rhs1_ptr (stmt),
 920                                    gimple_assign_rhs2_ptr (stmt));
 921               else if (code == COND_EXPR
 922                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
 923                 {
 924                   tree cond_expr = gimple_assign_rhs1 (stmt);
 925                   enum tree_code cond_code = TREE_CODE (cond_expr);
 926
 927                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
 928                     {
 929                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
 930                                                                   0));
 931                       cond_code = invert_tree_comparison (cond_code,
 932                                                           honor_nans);
 933                       if (cond_code != ERROR_MARK)
 934                         {
 935                           TREE_SET_CODE (cond_expr, cond_code);
 936                           swap_ssa_operands (stmt,
 937                                              gimple_assign_rhs2_ptr (stmt),
 938                                              gimple_assign_rhs3_ptr (stmt));
 939                         }
 940                     }
 941                 }
 942             }
 943
 944           /* Free stmt_vec_info.  */
 945           free_stmt_vec_info (stmt);
 946           gsi_next (&si);
 947         }
 948     }
 949
 950   free (bbs);
 951
 952   release_vec_loop_masks (&masks);
 953   delete ivexpr_map;
 954
 955   loop->aux = NULL;
 956 }
 957
 958 /* Return an invariant or register for EXPR and emit necessary
 959    computations in the LOOP_VINFO loop preheader.  */
 960
 961 tree
 962 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 963 {
 964   if (is_gimple_reg (expr)
 965       || is_gimple_min_invariant (expr))
 966     return expr;
 967
 968   if (! loop_vinfo->ivexpr_map)
 969     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 970   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 971   if (! cached)
 972     {
 973       gimple_seq stmts = NULL;
 974       cached = force_gimple_operand (unshare_expr (expr),
 975                                      &stmts, true, NULL_TREE);
 976       if (stmts)
 977         {
 978           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 979           gsi_insert_seq_on_edge_immediate (e, stmts);
 980         }
 981     }
 982   return cached;
 983 }
 984
 985 /* Return true if we can use CMP_TYPE as the comparison type to produce
 986    all masks required to mask LOOP_VINFO.  */
 987
 988 static bool
 989 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 990 {
 991   rgroup_masks *rgm;
 992   unsigned int i;
 993   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 994     if (rgm->mask_type != NULL_TREE
 995         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 996                                             cmp_type, rgm->mask_type,
 997                                             OPTIMIZE_FOR_SPEED))
 998       return false;
 999   return true;
1000 }
1001
1002 /* Calculate the maximum number of scalars per iteration for every
1003    rgroup in LOOP_VINFO.  */
1004
1005 static unsigned int
1006 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1007 {
1008   unsigned int res = 1;
1009   unsigned int i;
1010   rgroup_masks *rgm;
1011   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1012     res = MAX (res, rgm->max_nscalars_per_iter);
1013   return res;
1014 }
1015
1016 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1017    whether we can actually generate the masks required.  Return true if so,
1018    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1019
1020 static bool
1021 vect_verify_full_masking (loop_vec_info loop_vinfo)
1022 {
1023   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1024   unsigned int min_ni_width;
1025
1026   /* Use a normal loop if there are no statements that need masking.
1027      This only happens in rare degenerate cases: it means that the loop
1028      has no loads, no stores, and no live-out values.  */
1029   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1030     return false;
1031
1032   /* Get the maximum number of iterations that is representable
1033      in the counter type.  */
1034   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1035   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1036
1037   /* Get a more refined estimate for the number of iterations.  */
1038   widest_int max_back_edges;
1039   if (max_loop_iterations (loop, &max_back_edges))
1040     max_ni = wi::smin (max_ni, max_back_edges + 1);
1041
1042   /* Account for rgroup masks, in which each bit is replicated N times.  */
1043   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1044
1045   /* Work out how many bits we need to represent the limit.  */
1046   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1047
1048   /* Find a scalar mode for which WHILE_ULT is supported.  */
1049   opt_scalar_int_mode cmp_mode_iter;
1050   tree cmp_type = NULL_TREE;
1051   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1052     {
1053       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1054       if (cmp_bits >= min_ni_width
1055           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1056         {
1057           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1058           if (this_type
1059               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1060             {
1061               /* Although we could stop as soon as we find a valid mode,
1062                  it's often better to continue until we hit Pmode, since the
1063                  operands to the WHILE are more likely to be reusable in
1064                  address calculations.  */
1065               cmp_type = this_type;
1066               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1067                 break;
1068             }
1069         }
1070     }
1071
1072   if (!cmp_type)
1073     return false;
1074
1075   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1076   return true;
1077 }
1078
1079 /* Calculate the cost of one scalar iteration of the loop.  */
1080 static void
1081 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1082 {
1083   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1084   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1085   int nbbs = loop->num_nodes, factor;
1086   int innerloop_iters, i;
1087
1088   /* Gather costs for statements in the scalar loop.  */
1089
1090   /* FORNOW.  */
1091   innerloop_iters = 1;
1092   if (loop->inner)
1093     innerloop_iters = 50; /* FIXME */
1094
1095   for (i = 0; i < nbbs; i++)
1096     {
1097       gimple_stmt_iterator si;
1098       basic_block bb = bbs[i];
1099
1100       if (bb->loop_father == loop->inner)
1101         factor = innerloop_iters;
1102       else
1103         factor = 1;
1104
1105       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1106         {
1107           gimple *stmt = gsi_stmt (si);
1108           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1109
1110           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1111             continue;
1112
1113           /* Skip stmts that are not vectorized inside the loop.  */
1114           if (stmt_info
1115               && !STMT_VINFO_RELEVANT_P (stmt_info)
1116               && (!STMT_VINFO_LIVE_P (stmt_info)
1117                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1118               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1119             continue;
1120
1121           vect_cost_for_stmt kind;
1122           if (STMT_VINFO_DATA_REF (stmt_info))
1123             {
1124               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1125                kind = scalar_load;
1126              else
1127                kind = scalar_store;
1128             }
1129           else
1130             kind = scalar_stmt;
1131
1132           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1133                             factor, kind, stmt_info, 0, vect_prologue);
1134         }
1135     }
1136
1137   /* Now accumulate cost.  */
1138   void *target_cost_data = init_cost (loop);
1139   stmt_info_for_cost *si;
1140   int j;
1141   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1142                     j, si)
1143     {
1144       struct _stmt_vec_info *stmt_info
1145         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1146       (void) add_stmt_cost (target_cost_data, si->count,
1147                             si->kind, stmt_info, si->misalign,
1148                             vect_body);
1149     }
1150   unsigned dummy, body_cost = 0;
1151   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1152   destroy_cost_data (target_cost_data);
1153   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1154 }
1155
1156
1157 /* Function vect_analyze_loop_form_1.
1158
1159    Verify that certain CFG restrictions hold, including:
1160    - the loop has a pre-header
1161    - the loop has a single entry and exit
1162    - the loop exit condition is simple enough
1163    - the number of iterations can be analyzed, i.e, a countable loop.  The
1164      niter could be analyzed under some assumptions.  */
1165
1166 bool
1167 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1168                           tree *assumptions, tree *number_of_iterationsm1,
1169                           tree *number_of_iterations, gcond **inner_loop_cond)
1170 {
1171   if (dump_enabled_p ())
1172     dump_printf_loc (MSG_NOTE, vect_location,
1173                      "=== vect_analyze_loop_form ===\n");
1174
1175   /* Different restrictions apply when we are considering an inner-most loop,
1176      vs. an outer (nested) loop.
1177      (FORNOW. May want to relax some of these restrictions in the future).  */
1178
1179   if (!loop->inner)
1180     {
1181       /* Inner-most loop.  We currently require that the number of BBs is
1182          exactly 2 (the header and latch).  Vectorizable inner-most loops
1183          look like this:
1184
1185                         (pre-header)
1186                            |
1187                           header <--------+
1188                            | |            |
1189                            | +--> latch --+
1190                            |
1191                         (exit-bb)  */
1192
1193       if (loop->num_nodes != 2)
1194         {
1195           if (dump_enabled_p ())
1196             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197                              "not vectorized: control flow in loop.\n");
1198           return false;
1199         }
1200
1201       if (empty_block_p (loop->header))
1202         {
1203           if (dump_enabled_p ())
1204             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1205                              "not vectorized: empty loop.\n");
1206           return false;
1207         }
1208     }
1209   else
1210     {
1211       struct loop *innerloop = loop->inner;
1212       edge entryedge;
1213
1214       /* Nested loop. We currently require that the loop is doubly-nested,
1215          contains a single inner loop, and the number of BBs is exactly 5.
1216          Vectorizable outer-loops look like this:
1217
1218                         (pre-header)
1219                            |
1220                           header <---+
1221                            |         |
1222                           inner-loop |
1223                            |         |
1224                           tail ------+
1225                            |
1226                         (exit-bb)
1227
1228          The inner-loop has the properties expected of inner-most loops
1229          as described above.  */
1230
1231       if ((loop->inner)->inner || (loop->inner)->next)
1232         {
1233           if (dump_enabled_p ())
1234             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235                              "not vectorized: multiple nested loops.\n");
1236           return false;
1237         }
1238
1239       if (loop->num_nodes != 5)
1240         {
1241           if (dump_enabled_p ())
1242             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1243                              "not vectorized: control flow in loop.\n");
1244           return false;
1245         }
1246
1247       entryedge = loop_preheader_edge (innerloop);
1248       if (entryedge->src != loop->header
1249           || !single_exit (innerloop)
1250           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1251         {
1252           if (dump_enabled_p ())
1253             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254                              "not vectorized: unsupported outerloop form.\n");
1255           return false;
1256         }
1257
1258       /* Analyze the inner-loop.  */
1259       tree inner_niterm1, inner_niter, inner_assumptions;
1260       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1261                                       &inner_assumptions, &inner_niterm1,
1262                                       &inner_niter, NULL)
1263           /* Don't support analyzing niter under assumptions for inner
1264              loop.  */
1265           || !integer_onep (inner_assumptions))
1266         {
1267           if (dump_enabled_p ())
1268             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269                              "not vectorized: Bad inner loop.\n");
1270           return false;
1271         }
1272
1273       if (!expr_invariant_in_loop_p (loop, inner_niter))
1274         {
1275           if (dump_enabled_p ())
1276             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1277                              "not vectorized: inner-loop count not"
1278                              " invariant.\n");
1279           return false;
1280         }
1281
1282       if (dump_enabled_p ())
1283         dump_printf_loc (MSG_NOTE, vect_location,
1284                          "Considering outer-loop vectorization.\n");
1285     }
1286
1287   if (!single_exit (loop)
1288       || EDGE_COUNT (loop->header->preds) != 2)
1289     {
1290       if (dump_enabled_p ())
1291         {
1292           if (!single_exit (loop))
1293             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1294                              "not vectorized: multiple exits.\n");
1295           else if (EDGE_COUNT (loop->header->preds) != 2)
1296             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1297                              "not vectorized: too many incoming edges.\n");
1298         }
1299       return false;
1300     }
1301
1302   /* We assume that the loop exit condition is at the end of the loop. i.e,
1303      that the loop is represented as a do-while (with a proper if-guard
1304      before the loop if needed), where the loop header contains all the
1305      executable statements, and the latch is empty.  */
1306   if (!empty_block_p (loop->latch)
1307       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1308     {
1309       if (dump_enabled_p ())
1310         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311                          "not vectorized: latch block not empty.\n");
1312       return false;
1313     }
1314
1315   /* Make sure the exit is not abnormal.  */
1316   edge e = single_exit (loop);
1317   if (e->flags & EDGE_ABNORMAL)
1318     {
1319       if (dump_enabled_p ())
1320         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1321                          "not vectorized: abnormal loop exit edge.\n");
1322       return false;
1323     }
1324
1325   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1326                                      number_of_iterationsm1);
1327   if (!*loop_cond)
1328     {
1329       if (dump_enabled_p ())
1330         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1331                          "not vectorized: complicated exit condition.\n");
1332       return false;
1333     }
1334
1335   if (integer_zerop (*assumptions)
1336       || !*number_of_iterations
1337       || chrec_contains_undetermined (*number_of_iterations))
1338     {
1339       if (dump_enabled_p ())
1340         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1341                          "not vectorized: number of iterations cannot be "
1342                          "computed.\n");
1343       return false;
1344     }
1345
1346   if (integer_zerop (*number_of_iterations))
1347     {
1348       if (dump_enabled_p ())
1349         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1350                          "not vectorized: number of iterations = 0.\n");
1351       return false;
1352     }
1353
1354   return true;
1355 }
1356
1357 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1358
1359 loop_vec_info
1360 vect_analyze_loop_form (struct loop *loop)
1361 {
1362   tree assumptions, number_of_iterations, number_of_iterationsm1;
1363   gcond *loop_cond, *inner_loop_cond = NULL;
1364
1365   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1366                                   &assumptions, &number_of_iterationsm1,
1367                                   &number_of_iterations, &inner_loop_cond))
1368     return NULL;
1369
1370   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1371   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1372   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1373   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1374   if (!integer_onep (assumptions))
1375     {
1376       /* We consider to vectorize this loop by versioning it under
1377          some assumptions.  In order to do this, we need to clear
1378          existing information computed by scev and niter analyzer.  */
1379       scev_reset_htab ();
1380       free_numbers_of_iterations_estimates (loop);
1381       /* Also set flag for this loop so that following scev and niter
1382          analysis are done under the assumptions.  */
1383       loop_constraint_set (loop, LOOP_C_FINITE);
1384       /* Also record the assumptions for versioning.  */
1385       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1386     }
1387
1388   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1389     {
1390       if (dump_enabled_p ())
1391         {
1392           dump_printf_loc (MSG_NOTE, vect_location,
1393                            "Symbolic number of iterations is ");
1394           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1395           dump_printf (MSG_NOTE, "\n");
1396         }
1397     }
1398
1399   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1400   if (inner_loop_cond)
1401     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1402       = loop_exit_ctrl_vec_info_type;
1403
1404   gcc_assert (!loop->aux);
1405   loop->aux = loop_vinfo;
1406   return loop_vinfo;
1407 }
1408
1409
1410
1411 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1412    statements update the vectorization factor.  */
1413
1414 static void
1415 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1416 {
1417   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1418   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1419   int nbbs = loop->num_nodes;
1420   poly_uint64 vectorization_factor;
1421   int i;
1422
1423   if (dump_enabled_p ())
1424     dump_printf_loc (MSG_NOTE, vect_location,
1425                      "=== vect_update_vf_for_slp ===\n");
1426
1427   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1428   gcc_assert (known_ne (vectorization_factor, 0U));
1429
1430   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1431      vectorization factor of the loop is the unrolling factor required by
1432      the SLP instances.  If that unrolling factor is 1, we say, that we
1433      perform pure SLP on loop - cross iteration parallelism is not
1434      exploited.  */
1435   bool only_slp_in_loop = true;
1436   for (i = 0; i < nbbs; i++)
1437     {
1438       basic_block bb = bbs[i];
1439       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1440            gsi_next (&si))
1441         {
1442           gimple *stmt = gsi_stmt (si);
1443           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1444           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1445               && STMT_VINFO_RELATED_STMT (stmt_info))
1446             {
1447               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1448               stmt_info = vinfo_for_stmt (stmt);
1449             }
1450           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1451                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1452               && !PURE_SLP_STMT (stmt_info))
1453             /* STMT needs both SLP and loop-based vectorization.  */
1454             only_slp_in_loop = false;
1455         }
1456     }
1457
1458   if (only_slp_in_loop)
1459     {
1460       dump_printf_loc (MSG_NOTE, vect_location,
1461                        "Loop contains only SLP stmts\n");
1462       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1463     }
1464   else
1465     {
1466       dump_printf_loc (MSG_NOTE, vect_location,
1467                        "Loop contains SLP and non-SLP stmts\n");
1468       /* Both the vectorization factor and unroll factor have the form
1469          current_vector_size * X for some rational X, so they must have
1470          a common multiple.  */
1471       vectorization_factor
1472         = force_common_multiple (vectorization_factor,
1473                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1474     }
1475
1476   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1477   if (dump_enabled_p ())
1478     {
1479       dump_printf_loc (MSG_NOTE, vect_location,
1480                        "Updating vectorization factor to ");
1481       dump_dec (MSG_NOTE, vectorization_factor);
1482       dump_printf (MSG_NOTE, ".\n");
1483     }
1484 }
1485
1486 /* Return true if STMT_INFO describes a double reduction phi and if
1487    the other phi in the reduction is also relevant for vectorization.
1488    This rejects cases such as:
1489
1490       outer1:
1491         x_1 = PHI <x_3(outer2), ...>;
1492         ...
1493
1494       inner:
1495         x_2 = ...;
1496         ...
1497
1498       outer2:
1499         x_3 = PHI <x_2(inner)>;
1500
1501    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1502
1503 static bool
1504 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1505 {
1506   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1507     return false;
1508
1509   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1510   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1511 }
1512
1513 /* Function vect_analyze_loop_operations.
1514
1515    Scan the loop stmts and make sure they are all vectorizable.  */
1516
1517 static bool
1518 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1519 {
1520   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1521   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1522   int nbbs = loop->num_nodes;
1523   int i;
1524   stmt_vec_info stmt_info;
1525   bool need_to_vectorize = false;
1526   bool ok;
1527
1528   if (dump_enabled_p ())
1529     dump_printf_loc (MSG_NOTE, vect_location,
1530                      "=== vect_analyze_loop_operations ===\n");
1531
1532   stmt_vector_for_cost cost_vec;
1533   cost_vec.create (2);
1534
1535   for (i = 0; i < nbbs; i++)
1536     {
1537       basic_block bb = bbs[i];
1538
1539       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1540            gsi_next (&si))
1541         {
1542           gphi *phi = si.phi ();
1543           ok = true;
1544
1545           stmt_info = vinfo_for_stmt (phi);
1546           if (dump_enabled_p ())
1547             {
1548               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1549               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1550             }
1551           if (virtual_operand_p (gimple_phi_result (phi)))
1552             continue;
1553
1554           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1555              (i.e., a phi in the tail of the outer-loop).  */
1556           if (! is_loop_header_bb_p (bb))
1557             {
1558               /* FORNOW: we currently don't support the case that these phis
1559                  are not used in the outerloop (unless it is double reduction,
1560                  i.e., this phi is vect_reduction_def), cause this case
1561                  requires to actually do something here.  */
1562               if (STMT_VINFO_LIVE_P (stmt_info)
1563                   && !vect_active_double_reduction_p (stmt_info))
1564                 {
1565                   if (dump_enabled_p ())
1566                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1567                                      "Unsupported loop-closed phi in "
1568                                      "outer-loop.\n");
1569                   return false;
1570                 }
1571
1572               /* If PHI is used in the outer loop, we check that its operand
1573                  is defined in the inner loop.  */
1574               if (STMT_VINFO_RELEVANT_P (stmt_info))
1575                 {
1576                   tree phi_op;
1577                   gimple *op_def_stmt;
1578
1579                   if (gimple_phi_num_args (phi) != 1)
1580                     return false;
1581
1582                   phi_op = PHI_ARG_DEF (phi, 0);
1583                   if (TREE_CODE (phi_op) != SSA_NAME)
1584                     return false;
1585
1586                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1587                   if (gimple_nop_p (op_def_stmt)
1588                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1589                       || !vinfo_for_stmt (op_def_stmt))
1590                     return false;
1591
1592                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1593                         != vect_used_in_outer
1594                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1595                            != vect_used_in_outer_by_reduction)
1596                     return false;
1597                 }
1598
1599               continue;
1600             }
1601
1602           gcc_assert (stmt_info);
1603
1604           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1605                || STMT_VINFO_LIVE_P (stmt_info))
1606               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1607             {
1608               /* A scalar-dependence cycle that we don't support.  */
1609               if (dump_enabled_p ())
1610                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1611                                  "not vectorized: scalar dependence cycle.\n");
1612               return false;
1613             }
1614
1615           if (STMT_VINFO_RELEVANT_P (stmt_info))
1616             {
1617               need_to_vectorize = true;
1618               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1619                   && ! PURE_SLP_STMT (stmt_info))
1620                 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1621               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1622                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1623                        && ! PURE_SLP_STMT (stmt_info))
1624                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1625                                              &cost_vec);
1626             }
1627
1628           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1629           if (ok
1630               && STMT_VINFO_LIVE_P (stmt_info)
1631               && !PURE_SLP_STMT (stmt_info))
1632             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1633                                               &cost_vec);
1634
1635           if (!ok)
1636             {
1637               if (dump_enabled_p ())
1638                 {
1639                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1640                                    "not vectorized: relevant phi not "
1641                                    "supported: ");
1642                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1643                 }
1644               return false;
1645             }
1646         }
1647
1648       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1649            gsi_next (&si))
1650         {
1651           gimple *stmt = gsi_stmt (si);
1652           if (!gimple_clobber_p (stmt)
1653               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1654                                      &cost_vec))
1655             return false;
1656         }
1657     } /* bbs */
1658
1659   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1660   cost_vec.release ();
1661
1662   /* All operations in the loop are either irrelevant (deal with loop
1663      control, or dead), or only used outside the loop and can be moved
1664      out of the loop (e.g. invariants, inductions).  The loop can be
1665      optimized away by scalar optimizations.  We're better off not
1666      touching this loop.  */
1667   if (!need_to_vectorize)
1668     {
1669       if (dump_enabled_p ())
1670         dump_printf_loc (MSG_NOTE, vect_location,
1671                          "All the computation can be taken out of the loop.\n");
1672       if (dump_enabled_p ())
1673         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1674                          "not vectorized: redundant loop. no profit to "
1675                          "vectorize.\n");
1676       return false;
1677     }
1678
1679   return true;
1680 }
1681
1682 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1683    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1684    definitely no, or -1 if it's worth retrying.  */
1685
1686 static int
1687 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1688 {
1689   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1690   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1691
1692   /* Only fully-masked loops can have iteration counts less than the
1693      vectorization factor.  */
1694   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1695     {
1696       HOST_WIDE_INT max_niter;
1697
1698       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1699         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1700       else
1701         max_niter = max_stmt_executions_int (loop);
1702
1703       if (max_niter != -1
1704           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1705         {
1706           if (dump_enabled_p ())
1707             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1708                              "not vectorized: iteration count smaller than "
1709                              "vectorization factor.\n");
1710           return 0;
1711         }
1712     }
1713
1714   int min_profitable_iters, min_profitable_estimate;
1715   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1716                                       &min_profitable_estimate);
1717
1718   if (min_profitable_iters < 0)
1719     {
1720       if (dump_enabled_p ())
1721         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1722                          "not vectorized: vectorization not profitable.\n");
1723       if (dump_enabled_p ())
1724         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725                          "not vectorized: vector version will never be "
1726                          "profitable.\n");
1727       return -1;
1728     }
1729
1730   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1731                                * assumed_vf);
1732
1733   /* Use the cost model only if it is more conservative than user specified
1734      threshold.  */
1735   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1736                                     min_profitable_iters);
1737
1738   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1739
1740   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1741       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1742     {
1743       if (dump_enabled_p ())
1744         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1745                          "not vectorized: vectorization not profitable.\n");
1746       if (dump_enabled_p ())
1747         dump_printf_loc (MSG_NOTE, vect_location,
1748                          "not vectorized: iteration count smaller than user "
1749                          "specified loop bound parameter or minimum profitable "
1750                          "iterations (whichever is more conservative).\n");
1751       return 0;
1752     }
1753
1754   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1755   if (estimated_niter == -1)
1756     estimated_niter = likely_max_stmt_executions_int (loop);
1757   if (estimated_niter != -1
1758       && ((unsigned HOST_WIDE_INT) estimated_niter
1759           < MAX (th, (unsigned) min_profitable_estimate)))
1760     {
1761       if (dump_enabled_p ())
1762         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1763                          "not vectorized: estimated iteration count too "
1764                          "small.\n");
1765       if (dump_enabled_p ())
1766         dump_printf_loc (MSG_NOTE, vect_location,
1767                          "not vectorized: estimated iteration count smaller "
1768                          "than specified loop bound parameter or minimum "
1769                          "profitable iterations (whichever is more "
1770                          "conservative).\n");
1771       return -1;
1772     }
1773
1774   return 1;
1775 }
1776
1777 static bool
1778 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1779                            vec<data_reference_p> *datarefs,
1780                            unsigned int *n_stmts)
1781 {
1782   *n_stmts = 0;
1783   for (unsigned i = 0; i < loop->num_nodes; i++)
1784     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1785          !gsi_end_p (gsi); gsi_next (&gsi))
1786       {
1787         gimple *stmt = gsi_stmt (gsi);
1788         if (is_gimple_debug (stmt))
1789           continue;
1790         ++(*n_stmts);
1791         if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1792           {
1793             if (is_gimple_call (stmt) && loop->safelen)
1794               {
1795                 tree fndecl = gimple_call_fndecl (stmt), op;
1796                 if (fndecl != NULL_TREE)
1797                   {
1798                     cgraph_node *node = cgraph_node::get (fndecl);
1799                     if (node != NULL && node->simd_clones != NULL)
1800                       {
1801                         unsigned int j, n = gimple_call_num_args (stmt);
1802                         for (j = 0; j < n; j++)
1803                           {
1804                             op = gimple_call_arg (stmt, j);
1805                             if (DECL_P (op)
1806                                 || (REFERENCE_CLASS_P (op)
1807                                     && get_base_address (op)))
1808                               break;
1809                           }
1810                         op = gimple_call_lhs (stmt);
1811                         /* Ignore #pragma omp declare simd functions
1812                            if they don't have data references in the
1813                            call stmt itself.  */
1814                         if (j == n
1815                             && !(op
1816                                  && (DECL_P (op)
1817                                      || (REFERENCE_CLASS_P (op)
1818                                          && get_base_address (op)))))
1819                           continue;
1820                       }
1821                   }
1822               }
1823             return false;
1824           }
1825       }
1826   return true;
1827 }
1828
1829 /* Function vect_analyze_loop_2.
1830
1831    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1832    for it.  The different analyses will record information in the
1833    loop_vec_info struct.  */
1834 static bool
1835 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1836 {
1837   bool ok;
1838   int res;
1839   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1840   poly_uint64 min_vf = 2;
1841
1842   /* The first group of checks is independent of the vector size.  */
1843   fatal = true;
1844
1845   /* Find all data references in the loop (which correspond to vdefs/vuses)
1846      and analyze their evolution in the loop.  */
1847
1848   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1849   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1850     {
1851       if (dump_enabled_p ())
1852         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1853                          "not vectorized: loop nest containing two "
1854                          "or more consecutive inner loops cannot be "
1855                          "vectorized\n");
1856       return false;
1857     }
1858
1859   /* Gather the data references and count stmts in the loop.  */
1860   unsigned int n_stmts;
1861   if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1862                                   &LOOP_VINFO_DATAREFS (loop_vinfo),
1863                                   &n_stmts))
1864     {
1865       if (dump_enabled_p ())
1866         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867                          "not vectorized: loop contains function "
1868                          "calls or data references that cannot "
1869                          "be analyzed\n");
1870       return false;
1871     }
1872
1873   /* Analyze the data references and also adjust the minimal
1874      vectorization factor according to the loads and stores.  */
1875
1876   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1877   if (!ok)
1878     {
1879       if (dump_enabled_p ())
1880         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1881                          "bad data references.\n");
1882       return false;
1883     }
1884
1885   /* Classify all cross-iteration scalar data-flow cycles.
1886      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1887   vect_analyze_scalar_cycles (loop_vinfo);
1888
1889   vect_pattern_recog (loop_vinfo);
1890
1891   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1892
1893   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1894      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1895
1896   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1897   if (!ok)
1898     {
1899       if (dump_enabled_p ())
1900         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1901                          "bad data access.\n");
1902       return false;
1903     }
1904
1905   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1906
1907   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1908   if (!ok)
1909     {
1910       if (dump_enabled_p ())
1911         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1912                          "unexpected pattern.\n");
1913       return false;
1914     }
1915
1916   /* While the rest of the analysis below depends on it in some way.  */
1917   fatal = false;
1918
1919   /* Analyze data dependences between the data-refs in the loop
1920      and adjust the maximum vectorization factor according to
1921      the dependences.
1922      FORNOW: fail at the first data dependence that we encounter.  */
1923
1924   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1925   if (!ok
1926       || (max_vf != MAX_VECTORIZATION_FACTOR
1927           && maybe_lt (max_vf, min_vf)))
1928     {
1929       if (dump_enabled_p ())
1930             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931                              "bad data dependence.\n");
1932       return false;
1933     }
1934   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1935
1936   ok = vect_determine_vectorization_factor (loop_vinfo);
1937   if (!ok)
1938     {
1939       if (dump_enabled_p ())
1940         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941                          "can't determine vectorization factor.\n");
1942       return false;
1943     }
1944   if (max_vf != MAX_VECTORIZATION_FACTOR
1945       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1946     {
1947       if (dump_enabled_p ())
1948         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1949                          "bad data dependence.\n");
1950       return false;
1951     }
1952
1953   /* Compute the scalar iteration cost.  */
1954   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1955
1956   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1957   unsigned th;
1958
1959   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1960   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1961   if (!ok)
1962     return false;
1963
1964   /* If there are any SLP instances mark them as pure_slp.  */
1965   bool slp = vect_make_slp_decision (loop_vinfo);
1966   if (slp)
1967     {
1968       /* Find stmts that need to be both vectorized and SLPed.  */
1969       vect_detect_hybrid_slp (loop_vinfo);
1970
1971       /* Update the vectorization factor based on the SLP decision.  */
1972       vect_update_vf_for_slp (loop_vinfo);
1973     }
1974
1975   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1976
1977   /* We don't expect to have to roll back to anything other than an empty
1978      set of rgroups.  */
1979   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1980
1981   /* This is the point where we can re-start analysis with SLP forced off.  */
1982 start_over:
1983
1984   /* Now the vectorization factor is final.  */
1985   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1986   gcc_assert (known_ne (vectorization_factor, 0U));
1987
1988   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1989     {
1990       dump_printf_loc (MSG_NOTE, vect_location,
1991                        "vectorization_factor = ");
1992       dump_dec (MSG_NOTE, vectorization_factor);
1993       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1994                    LOOP_VINFO_INT_NITERS (loop_vinfo));
1995     }
1996
1997   HOST_WIDE_INT max_niter
1998     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1999
2000   /* Analyze the alignment of the data-refs in the loop.
2001      Fail if a data reference is found that cannot be vectorized.  */
2002
2003   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2004   if (!ok)
2005     {
2006       if (dump_enabled_p ())
2007         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2008                          "bad data alignment.\n");
2009       return false;
2010     }
2011
2012   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2013      It is important to call pruning after vect_analyze_data_ref_accesses,
2014      since we use grouping information gathered by interleaving analysis.  */
2015   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2016   if (!ok)
2017     return false;
2018
2019   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2020      vectorization.  */
2021   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2022     {
2023     /* This pass will decide on using loop versioning and/or loop peeling in
2024        order to enhance the alignment of data references in the loop.  */
2025     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2026     if (!ok)
2027       {
2028         if (dump_enabled_p ())
2029           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2030                            "bad data alignment.\n");
2031         return false;
2032       }
2033     }
2034
2035   if (slp)
2036     {
2037       /* Analyze operations in the SLP instances.  Note this may
2038          remove unsupported SLP instances which makes the above
2039          SLP kind detection invalid.  */
2040       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2041       vect_slp_analyze_operations (loop_vinfo);
2042       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2043         goto again;
2044     }
2045
2046   /* Scan all the remaining operations in the loop that are not subject
2047      to SLP and make sure they are vectorizable.  */
2048   ok = vect_analyze_loop_operations (loop_vinfo);
2049   if (!ok)
2050     {
2051       if (dump_enabled_p ())
2052         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2053                          "bad operation or unsupported loop bound.\n");
2054       return false;
2055     }
2056
2057   /* Decide whether to use a fully-masked loop for this vectorization
2058      factor.  */
2059   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2060     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2061        && vect_verify_full_masking (loop_vinfo));
2062   if (dump_enabled_p ())
2063     {
2064       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2065         dump_printf_loc (MSG_NOTE, vect_location,
2066                          "using a fully-masked loop.\n");
2067       else
2068         dump_printf_loc (MSG_NOTE, vect_location,
2069                          "not using a fully-masked loop.\n");
2070     }
2071
2072   /* If epilog loop is required because of data accesses with gaps,
2073      one additional iteration needs to be peeled.  Check if there is
2074      enough iterations for vectorization.  */
2075   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2076       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2077       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2078     {
2079       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2080       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2081
2082       if (known_lt (wi::to_widest (scalar_niters), vf))
2083         {
2084           if (dump_enabled_p ())
2085             dump_printf_loc (MSG_NOTE, vect_location,
2086                              "loop has no enough iterations to support"
2087                              " peeling for gaps.\n");
2088           return false;
2089         }
2090     }
2091
2092   /* Check the costings of the loop make vectorizing worthwhile.  */
2093   res = vect_analyze_loop_costing (loop_vinfo);
2094   if (res < 0)
2095     goto again;
2096   if (!res)
2097     {
2098       if (dump_enabled_p ())
2099         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2100                          "Loop costings not worthwhile.\n");
2101       return false;
2102     }
2103
2104   /* Decide whether we need to create an epilogue loop to handle
2105      remaining scalar iterations.  */
2106   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2107
2108   unsigned HOST_WIDE_INT const_vf;
2109   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2110     /* The main loop handles all iterations.  */
2111     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2112   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2113            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2114     {
2115       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2116                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2117                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2118         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2119     }
2120   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2121            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2122            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2123                 < (unsigned) exact_log2 (const_vf))
2124                /* In case of versioning, check if the maximum number of
2125                   iterations is greater than th.  If they are identical,
2126                   the epilogue is unnecessary.  */
2127                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2128                    || ((unsigned HOST_WIDE_INT) max_niter
2129                        > (th / const_vf) * const_vf))))
2130     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2131
2132   /* If an epilogue loop is required make sure we can create one.  */
2133   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2134       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2135     {
2136       if (dump_enabled_p ())
2137         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2138       if (!vect_can_advance_ivs_p (loop_vinfo)
2139           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2140                                            single_exit (LOOP_VINFO_LOOP
2141                                                          (loop_vinfo))))
2142         {
2143           if (dump_enabled_p ())
2144             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2145                              "not vectorized: can't create required "
2146                              "epilog loop\n");
2147           goto again;
2148         }
2149     }
2150
2151   /* During peeling, we need to check if number of loop iterations is
2152      enough for both peeled prolog loop and vector loop.  This check
2153      can be merged along with threshold check of loop versioning, so
2154      increase threshold for this case if necessary.  */
2155   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2156     {
2157       poly_uint64 niters_th = 0;
2158
2159       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2160         {
2161           /* Niters for peeled prolog loop.  */
2162           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2163             {
2164               struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2165               tree vectype
2166                 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2167               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2168             }
2169           else
2170             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2171         }
2172
2173       /* Niters for at least one iteration of vectorized loop.  */
2174       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2175         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2176       /* One additional iteration because of peeling for gap.  */
2177       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2178         niters_th += 1;
2179       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2180     }
2181
2182   gcc_assert (known_eq (vectorization_factor,
2183                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2184
2185   /* Ok to vectorize!  */
2186   return true;
2187
2188 again:
2189   /* Try again with SLP forced off but if we didn't do any SLP there is
2190      no point in re-trying.  */
2191   if (!slp)
2192     return false;
2193
2194   /* If there are reduction chains re-trying will fail anyway.  */
2195   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2196     return false;
2197
2198   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2199      via interleaving or lane instructions.  */
2200   slp_instance instance;
2201   slp_tree node;
2202   unsigned i, j;
2203   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2204     {
2205       stmt_vec_info vinfo;
2206       vinfo = vinfo_for_stmt
2207           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2208       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2209         continue;
2210       vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2211       unsigned int size = DR_GROUP_SIZE (vinfo);
2212       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2213       if (! vect_store_lanes_supported (vectype, size, false)
2214          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2215          && ! vect_grouped_store_supported (vectype, size))
2216        return false;
2217       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2218         {
2219           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2220           vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2221           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2222           size = DR_GROUP_SIZE (vinfo);
2223           vectype = STMT_VINFO_VECTYPE (vinfo);
2224           if (! vect_load_lanes_supported (vectype, size, false)
2225               && ! vect_grouped_load_supported (vectype, single_element_p,
2226                                                 size))
2227             return false;
2228         }
2229     }
2230
2231   if (dump_enabled_p ())
2232     dump_printf_loc (MSG_NOTE, vect_location,
2233                      "re-trying with SLP disabled\n");
2234
2235   /* Roll back state appropriately.  No SLP this time.  */
2236   slp = false;
2237   /* Restore vectorization factor as it were without SLP.  */
2238   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2239   /* Free the SLP instances.  */
2240   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2241     vect_free_slp_instance (instance);
2242   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2243   /* Reset SLP type to loop_vect on all stmts.  */
2244   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2245     {
2246       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2247       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2248            !gsi_end_p (si); gsi_next (&si))
2249         {
2250           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2251           STMT_SLP_TYPE (stmt_info) = loop_vect;
2252         }
2253       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2254            !gsi_end_p (si); gsi_next (&si))
2255         {
2256           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2257           STMT_SLP_TYPE (stmt_info) = loop_vect;
2258           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2259             {
2260               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2261               STMT_SLP_TYPE (stmt_info) = loop_vect;
2262               for (gimple_stmt_iterator pi
2263                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2264                    !gsi_end_p (pi); gsi_next (&pi))
2265                 {
2266                   gimple *pstmt = gsi_stmt (pi);
2267                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2268                 }
2269             }
2270         }
2271     }
2272   /* Free optimized alias test DDRS.  */
2273   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2274   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2275   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2276   /* Reset target cost data.  */
2277   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2278   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2279     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2280   /* Reset accumulated rgroup information.  */
2281   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2282   /* Reset assorted flags.  */
2283   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2284   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2285   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2286   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2287   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2288
2289   goto start_over;
2290 }
2291
2292 /* Function vect_analyze_loop.
2293
2294    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2295    for it.  The different analyses will record information in the
2296    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2297    be vectorized.  */
2298 loop_vec_info
2299 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2300 {
2301   loop_vec_info loop_vinfo;
2302   auto_vector_sizes vector_sizes;
2303
2304   /* Autodetect first vector size we try.  */
2305   current_vector_size = 0;
2306   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2307   unsigned int next_size = 0;
2308
2309   if (dump_enabled_p ())
2310     dump_printf_loc (MSG_NOTE, vect_location,
2311                      "===== analyze_loop_nest =====\n");
2312
2313   if (loop_outer (loop)
2314       && loop_vec_info_for_loop (loop_outer (loop))
2315       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2316     {
2317       if (dump_enabled_p ())
2318         dump_printf_loc (MSG_NOTE, vect_location,
2319                          "outer-loop already vectorized.\n");
2320       return NULL;
2321     }
2322
2323   poly_uint64 autodetected_vector_size = 0;
2324   while (1)
2325     {
2326       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2327       loop_vinfo = vect_analyze_loop_form (loop);
2328       if (!loop_vinfo)
2329         {
2330           if (dump_enabled_p ())
2331             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2332                              "bad loop form.\n");
2333           return NULL;
2334         }
2335
2336       bool fatal = false;
2337
2338       if (orig_loop_vinfo)
2339         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2340
2341       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2342         {
2343           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2344
2345           return loop_vinfo;
2346         }
2347
2348       delete loop_vinfo;
2349
2350       if (next_size == 0)
2351         autodetected_vector_size = current_vector_size;
2352
2353       if (next_size < vector_sizes.length ()
2354           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2355         next_size += 1;
2356
2357       if (fatal
2358           || next_size == vector_sizes.length ()
2359           || known_eq (current_vector_size, 0U))
2360         return NULL;
2361
2362       /* Try the next biggest vector size.  */
2363       current_vector_size = vector_sizes[next_size++];
2364       if (dump_enabled_p ())
2365         {
2366           dump_printf_loc (MSG_NOTE, vect_location,
2367                            "***** Re-trying analysis with "
2368                            "vector size ");
2369           dump_dec (MSG_NOTE, current_vector_size);
2370           dump_printf (MSG_NOTE, "\n");
2371         }
2372     }
2373 }
2374
2375 /* Return true if there is an in-order reduction function for CODE, storing
2376    it in *REDUC_FN if so.  */
2377
2378 static bool
2379 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2380 {
2381   switch (code)
2382     {
2383     case PLUS_EXPR:
2384       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2385       return true;
2386
2387     default:
2388       return false;
2389     }
2390 }
2391
2392 /* Function reduction_fn_for_scalar_code
2393
2394    Input:
2395    CODE - tree_code of a reduction operations.
2396
2397    Output:
2398    REDUC_FN - the corresponding internal function to be used to reduce the
2399       vector of partial results into a single scalar result, or IFN_LAST
2400       if the operation is a supported reduction operation, but does not have
2401       such an internal function.
2402
2403    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2404
2405 static bool
2406 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2407 {
2408   switch (code)
2409     {
2410       case MAX_EXPR:
2411         *reduc_fn = IFN_REDUC_MAX;
2412         return true;
2413
2414       case MIN_EXPR:
2415         *reduc_fn = IFN_REDUC_MIN;
2416         return true;
2417
2418       case PLUS_EXPR:
2419         *reduc_fn = IFN_REDUC_PLUS;
2420         return true;
2421
2422       case BIT_AND_EXPR:
2423         *reduc_fn = IFN_REDUC_AND;
2424         return true;
2425
2426       case BIT_IOR_EXPR:
2427         *reduc_fn = IFN_REDUC_IOR;
2428         return true;
2429
2430       case BIT_XOR_EXPR:
2431         *reduc_fn = IFN_REDUC_XOR;
2432         return true;
2433
2434       case MULT_EXPR:
2435       case MINUS_EXPR:
2436         *reduc_fn = IFN_LAST;
2437         return true;
2438
2439       default:
2440        return false;
2441     }
2442 }
2443
2444 /* If there is a neutral value X such that SLP reduction NODE would not
2445    be affected by the introduction of additional X elements, return that X,
2446    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2447    is true if the SLP statements perform a single reduction, false if each
2448    statement performs an independent reduction.  */
2449
2450 static tree
2451 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2452                               bool reduc_chain)
2453 {
2454   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2455   gimple *stmt = stmts[0];
2456   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2457   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2458   tree scalar_type = TREE_TYPE (vector_type);
2459   struct loop *loop = gimple_bb (stmt)->loop_father;
2460   gcc_assert (loop);
2461
2462   switch (code)
2463     {
2464     case WIDEN_SUM_EXPR:
2465     case DOT_PROD_EXPR:
2466     case SAD_EXPR:
2467     case PLUS_EXPR:
2468     case MINUS_EXPR:
2469     case BIT_IOR_EXPR:
2470     case BIT_XOR_EXPR:
2471       return build_zero_cst (scalar_type);
2472
2473     case MULT_EXPR:
2474       return build_one_cst (scalar_type);
2475
2476     case BIT_AND_EXPR:
2477       return build_all_ones_cst (scalar_type);
2478
2479     case MAX_EXPR:
2480     case MIN_EXPR:
2481       /* For MIN/MAX the initial values are neutral.  A reduction chain
2482          has only a single initial value, so that value is neutral for
2483          all statements.  */
2484       if (reduc_chain)
2485         return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2486       return NULL_TREE;
2487
2488     default:
2489       return NULL_TREE;
2490     }
2491 }
2492
2493 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2494    STMT is printed with a message MSG. */
2495
2496 static void
2497 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2498 {
2499   dump_printf_loc (msg_type, vect_location, "%s", msg);
2500   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2501 }
2502
2503
2504 /* Detect SLP reduction of the form:
2505
2506    #a1 = phi <a5, a0>
2507    a2 = operation (a1)
2508    a3 = operation (a2)
2509    a4 = operation (a3)
2510    a5 = operation (a4)
2511
2512    #a = phi <a5>
2513
2514    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2515    FIRST_STMT is the first reduction stmt in the chain
2516    (a2 = operation (a1)).
2517
2518    Return TRUE if a reduction chain was detected.  */
2519
2520 static bool
2521 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2522                        gimple *first_stmt)
2523 {
2524   struct loop *loop = (gimple_bb (phi))->loop_father;
2525   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2526   enum tree_code code;
2527   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2528   stmt_vec_info use_stmt_info, current_stmt_info;
2529   tree lhs;
2530   imm_use_iterator imm_iter;
2531   use_operand_p use_p;
2532   int nloop_uses, size = 0, n_out_of_loop_uses;
2533   bool found = false;
2534
2535   if (loop != vect_loop)
2536     return false;
2537
2538   lhs = PHI_RESULT (phi);
2539   code = gimple_assign_rhs_code (first_stmt);
2540   while (1)
2541     {
2542       nloop_uses = 0;
2543       n_out_of_loop_uses = 0;
2544       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2545         {
2546           gimple *use_stmt = USE_STMT (use_p);
2547           if (is_gimple_debug (use_stmt))
2548             continue;
2549
2550           /* Check if we got back to the reduction phi.  */
2551           if (use_stmt == phi)
2552             {
2553               loop_use_stmt = use_stmt;
2554               found = true;
2555               break;
2556             }
2557
2558           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2559             {
2560               loop_use_stmt = use_stmt;
2561               nloop_uses++;
2562             }
2563            else
2564              n_out_of_loop_uses++;
2565
2566            /* There are can be either a single use in the loop or two uses in
2567               phi nodes.  */
2568            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2569              return false;
2570         }
2571
2572       if (found)
2573         break;
2574
2575       /* We reached a statement with no loop uses.  */
2576       if (nloop_uses == 0)
2577         return false;
2578
2579       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2580       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2581         return false;
2582
2583       if (!is_gimple_assign (loop_use_stmt)
2584           || code != gimple_assign_rhs_code (loop_use_stmt)
2585           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2586         return false;
2587
2588       /* Insert USE_STMT into reduction chain.  */
2589       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2590       if (current_stmt)
2591         {
2592           current_stmt_info = vinfo_for_stmt (current_stmt);
2593           REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2594           REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2595             = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2596         }
2597       else
2598         REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2599
2600       lhs = gimple_assign_lhs (loop_use_stmt);
2601       current_stmt = loop_use_stmt;
2602       size++;
2603    }
2604
2605   if (!found || loop_use_stmt != phi || size < 2)
2606     return false;
2607
2608   /* Swap the operands, if needed, to make the reduction operand be the second
2609      operand.  */
2610   lhs = PHI_RESULT (phi);
2611   next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2612   while (next_stmt)
2613     {
2614       if (gimple_assign_rhs2 (next_stmt) == lhs)
2615         {
2616           tree op = gimple_assign_rhs1 (next_stmt);
2617           gimple *def_stmt = NULL;
2618
2619           if (TREE_CODE (op) == SSA_NAME)
2620             def_stmt = SSA_NAME_DEF_STMT (op);
2621
2622           /* Check that the other def is either defined in the loop
2623              ("vect_internal_def"), or it's an induction (defined by a
2624              loop-header phi-node).  */
2625           if (def_stmt
2626               && gimple_bb (def_stmt)
2627               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2628               && (is_gimple_assign (def_stmt)
2629                   || is_gimple_call (def_stmt)
2630                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2631                            == vect_induction_def
2632                   || (gimple_code (def_stmt) == GIMPLE_PHI
2633                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2634                                   == vect_internal_def
2635                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2636             {
2637               lhs = gimple_assign_lhs (next_stmt);
2638               next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2639               continue;
2640             }
2641
2642           return false;
2643         }
2644       else
2645         {
2646           tree op = gimple_assign_rhs2 (next_stmt);
2647           gimple *def_stmt = NULL;
2648
2649           if (TREE_CODE (op) == SSA_NAME)
2650             def_stmt = SSA_NAME_DEF_STMT (op);
2651
2652           /* Check that the other def is either defined in the loop
2653             ("vect_internal_def"), or it's an induction (defined by a
2654             loop-header phi-node).  */
2655           if (def_stmt
2656               && gimple_bb (def_stmt)
2657               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2658               && (is_gimple_assign (def_stmt)
2659                   || is_gimple_call (def_stmt)
2660                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2661                               == vect_induction_def
2662                   || (gimple_code (def_stmt) == GIMPLE_PHI
2663                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2664                                   == vect_internal_def
2665                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2666             {
2667               if (dump_enabled_p ())
2668                 {
2669                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2670                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2671                 }
2672
2673               swap_ssa_operands (next_stmt,
2674                                  gimple_assign_rhs1_ptr (next_stmt),
2675                                  gimple_assign_rhs2_ptr (next_stmt));
2676               update_stmt (next_stmt);
2677
2678               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2679                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2680             }
2681           else
2682             return false;
2683         }
2684
2685       lhs = gimple_assign_lhs (next_stmt);
2686       next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2687     }
2688
2689   /* Save the chain for further analysis in SLP detection.  */
2690   first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2691   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2692   REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2693
2694   return true;
2695 }
2696
2697 /* Return true if we need an in-order reduction for operation CODE
2698    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2699    overflow must wrap.  */
2700
2701 static bool
2702 needs_fold_left_reduction_p (tree type, tree_code code,
2703                              bool need_wrapping_integral_overflow)
2704 {
2705   /* CHECKME: check for !flag_finite_math_only too?  */
2706   if (SCALAR_FLOAT_TYPE_P (type))
2707     switch (code)
2708       {
2709       case MIN_EXPR:
2710       case MAX_EXPR:
2711         return false;
2712
2713       default:
2714         return !flag_associative_math;
2715       }
2716
2717   if (INTEGRAL_TYPE_P (type))
2718     {
2719       if (!operation_no_trapping_overflow (type, code))
2720         return true;
2721       if (need_wrapping_integral_overflow
2722           && !TYPE_OVERFLOW_WRAPS (type)
2723           && operation_can_overflow (code))
2724         return true;
2725       return false;
2726     }
2727
2728   if (SAT_FIXED_POINT_TYPE_P (type))
2729     return true;
2730
2731   return false;
2732 }
2733
2734 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2735    reduction operation CODE has a handled computation expression.  */
2736
2737 bool
2738 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2739                       enum tree_code code)
2740 {
2741   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2742   auto_bitmap visited;
2743   tree lookfor = PHI_RESULT (phi);
2744   ssa_op_iter curri;
2745   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2746   while (USE_FROM_PTR (curr) != loop_arg)
2747     curr = op_iter_next_use (&curri);
2748   curri.i = curri.numops;
2749   do
2750     {
2751       path.safe_push (std::make_pair (curri, curr));
2752       tree use = USE_FROM_PTR (curr);
2753       if (use == lookfor)
2754         break;
2755       gimple *def = SSA_NAME_DEF_STMT (use);
2756       if (gimple_nop_p (def)
2757           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2758         {
2759 pop:
2760           do
2761             {
2762               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2763               curri = x.first;
2764               curr = x.second;
2765               do
2766                 curr = op_iter_next_use (&curri);
2767               /* Skip already visited or non-SSA operands (from iterating
2768                  over PHI args).  */
2769               while (curr != NULL_USE_OPERAND_P
2770                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2771                          || ! bitmap_set_bit (visited,
2772                                               SSA_NAME_VERSION
2773                                                 (USE_FROM_PTR (curr)))));
2774             }
2775           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2776           if (curr == NULL_USE_OPERAND_P)
2777             break;
2778         }
2779       else
2780         {
2781           if (gimple_code (def) == GIMPLE_PHI)
2782             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2783           else
2784             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2785           while (curr != NULL_USE_OPERAND_P
2786                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2787                      || ! bitmap_set_bit (visited,
2788                                           SSA_NAME_VERSION
2789                                             (USE_FROM_PTR (curr)))))
2790             curr = op_iter_next_use (&curri);
2791           if (curr == NULL_USE_OPERAND_P)
2792             goto pop;
2793         }
2794     }
2795   while (1);
2796   if (dump_file && (dump_flags & TDF_DETAILS))
2797     {
2798       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2799       unsigned i;
2800       std::pair<ssa_op_iter, use_operand_p> *x;
2801       FOR_EACH_VEC_ELT (path, i, x)
2802         {
2803           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2804           dump_printf (MSG_NOTE, " ");
2805         }
2806       dump_printf (MSG_NOTE, "\n");
2807     }
2808
2809   /* Check whether the reduction path detected is valid.  */
2810   bool fail = path.length () == 0;
2811   bool neg = false;
2812   for (unsigned i = 1; i < path.length (); ++i)
2813     {
2814       gimple *use_stmt = USE_STMT (path[i].second);
2815       tree op = USE_FROM_PTR (path[i].second);
2816       if (! has_single_use (op)
2817           || ! is_gimple_assign (use_stmt))
2818         {
2819           fail = true;
2820           break;
2821         }
2822       if (gimple_assign_rhs_code (use_stmt) != code)
2823         {
2824           if (code == PLUS_EXPR
2825               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2826             {
2827               /* Track whether we negate the reduction value each iteration.  */
2828               if (gimple_assign_rhs2 (use_stmt) == op)
2829                 neg = ! neg;
2830             }
2831           else
2832             {
2833               fail = true;
2834               break;
2835             }
2836         }
2837     }
2838   return ! fail && ! neg;
2839 }
2840
2841
2842 /* Function vect_is_simple_reduction
2843
2844    (1) Detect a cross-iteration def-use cycle that represents a simple
2845    reduction computation.  We look for the following pattern:
2846
2847    loop_header:
2848      a1 = phi < a0, a2 >
2849      a3 = ...
2850      a2 = operation (a3, a1)
2851
2852    or
2853
2854    a3 = ...
2855    loop_header:
2856      a1 = phi < a0, a2 >
2857      a2 = operation (a3, a1)
2858
2859    such that:
2860    1. operation is commutative and associative and it is safe to
2861       change the order of the computation
2862    2. no uses for a2 in the loop (a2 is used out of the loop)
2863    3. no uses of a1 in the loop besides the reduction operation
2864    4. no uses of a1 outside the loop.
2865
2866    Conditions 1,4 are tested here.
2867    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2868
2869    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2870    nested cycles.
2871
2872    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2873    reductions:
2874
2875      a1 = phi < a0, a2 >
2876      inner loop (def of a3)
2877      a2 = phi < a3 >
2878
2879    (4) Detect condition expressions, ie:
2880      for (int i = 0; i < N; i++)
2881        if (a[i] < val)
2882         ret_val = a[i];
2883
2884 */
2885
2886 static gimple *
2887 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2888                           bool *double_reduc,
2889                           bool need_wrapping_integral_overflow,
2890                           enum vect_reduction_type *v_reduc_type)
2891 {
2892   struct loop *loop = (gimple_bb (phi))->loop_father;
2893   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2894   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2895   enum tree_code orig_code, code;
2896   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2897   tree type;
2898   int nloop_uses;
2899   tree name;
2900   imm_use_iterator imm_iter;
2901   use_operand_p use_p;
2902   bool phi_def;
2903
2904   *double_reduc = false;
2905   *v_reduc_type = TREE_CODE_REDUCTION;
2906
2907   tree phi_name = PHI_RESULT (phi);
2908   /* ???  If there are no uses of the PHI result the inner loop reduction
2909      won't be detected as possibly double-reduction by vectorizable_reduction
2910      because that tries to walk the PHI arg from the preheader edge which
2911      can be constant.  See PR60382.  */
2912   if (has_zero_uses (phi_name))
2913     return NULL;
2914   nloop_uses = 0;
2915   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2916     {
2917       gimple *use_stmt = USE_STMT (use_p);
2918       if (is_gimple_debug (use_stmt))
2919         continue;
2920
2921       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2922         {
2923           if (dump_enabled_p ())
2924             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2925                              "intermediate value used outside loop.\n");
2926
2927           return NULL;
2928         }
2929
2930       nloop_uses++;
2931       if (nloop_uses > 1)
2932         {
2933           if (dump_enabled_p ())
2934             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2935                              "reduction value used in loop.\n");
2936           return NULL;
2937         }
2938
2939       phi_use_stmt = use_stmt;
2940     }
2941
2942   edge latch_e = loop_latch_edge (loop);
2943   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2944   if (TREE_CODE (loop_arg) != SSA_NAME)
2945     {
2946       if (dump_enabled_p ())
2947         {
2948           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2949                            "reduction: not ssa_name: ");
2950           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2951           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2952         }
2953       return NULL;
2954     }
2955
2956   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2957   if (is_gimple_assign (def_stmt))
2958     {
2959       name = gimple_assign_lhs (def_stmt);
2960       phi_def = false;
2961     }
2962   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2963     {
2964       name = PHI_RESULT (def_stmt);
2965       phi_def = true;
2966     }
2967   else
2968     {
2969       if (dump_enabled_p ())
2970         {
2971           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2972                            "reduction: unhandled reduction operation: ");
2973           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2974         }
2975       return NULL;
2976     }
2977
2978   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2979     return NULL;
2980
2981   nloop_uses = 0;
2982   auto_vec<gphi *, 3> lcphis;
2983   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2984     {
2985       gimple *use_stmt = USE_STMT (use_p);
2986       if (is_gimple_debug (use_stmt))
2987         continue;
2988       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2989         nloop_uses++;
2990       else
2991         /* We can have more than one loop-closed PHI.  */
2992         lcphis.safe_push (as_a <gphi *> (use_stmt));
2993       if (nloop_uses > 1)
2994         {
2995           if (dump_enabled_p ())
2996             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2997                              "reduction used in loop.\n");
2998           return NULL;
2999         }
3000     }
3001
3002   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3003      defined in the inner loop.  */
3004   if (phi_def)
3005     {
3006       op1 = PHI_ARG_DEF (def_stmt, 0);
3007
3008       if (gimple_phi_num_args (def_stmt) != 1
3009           || TREE_CODE (op1) != SSA_NAME)
3010         {
3011           if (dump_enabled_p ())
3012             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3013                              "unsupported phi node definition.\n");
3014
3015           return NULL;
3016         }
3017
3018       def1 = SSA_NAME_DEF_STMT (op1);
3019       if (gimple_bb (def1)
3020           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3021           && loop->inner
3022           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3023           && is_gimple_assign (def1)
3024           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3025         {
3026           if (dump_enabled_p ())
3027             report_vect_op (MSG_NOTE, def_stmt,
3028                             "detected double reduction: ");
3029
3030           *double_reduc = true;
3031           return def_stmt;
3032         }
3033
3034       return NULL;
3035     }
3036
3037   /* If we are vectorizing an inner reduction we are executing that
3038      in the original order only in case we are not dealing with a
3039      double reduction.  */
3040   bool check_reduction = true;
3041   if (flow_loop_nested_p (vect_loop, loop))
3042     {
3043       gphi *lcphi;
3044       unsigned i;
3045       check_reduction = false;
3046       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3047         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3048           {
3049             gimple *use_stmt = USE_STMT (use_p);
3050             if (is_gimple_debug (use_stmt))
3051               continue;
3052             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3053               check_reduction = true;
3054           }
3055     }
3056
3057   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3058   code = orig_code = gimple_assign_rhs_code (def_stmt);
3059
3060   /* We can handle "res -= x[i]", which is non-associative by
3061      simply rewriting this into "res += -x[i]".  Avoid changing
3062      gimple instruction for the first simple tests and only do this
3063      if we're allowed to change code at all.  */
3064   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3065     code = PLUS_EXPR;
3066
3067   if (code == COND_EXPR)
3068     {
3069       if (! nested_in_vect_loop)
3070         *v_reduc_type = COND_REDUCTION;
3071
3072       op3 = gimple_assign_rhs1 (def_stmt);
3073       if (COMPARISON_CLASS_P (op3))
3074         {
3075           op4 = TREE_OPERAND (op3, 1);
3076           op3 = TREE_OPERAND (op3, 0);
3077         }
3078       if (op3 == phi_name || op4 == phi_name)
3079         {
3080           if (dump_enabled_p ())
3081             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3082                             "reduction: condition depends on previous"
3083                             " iteration: ");
3084           return NULL;
3085         }
3086
3087       op1 = gimple_assign_rhs2 (def_stmt);
3088       op2 = gimple_assign_rhs3 (def_stmt);
3089     }
3090   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3091     {
3092       if (dump_enabled_p ())
3093         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3094                         "reduction: not commutative/associative: ");
3095       return NULL;
3096     }
3097   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3098     {
3099       op1 = gimple_assign_rhs1 (def_stmt);
3100       op2 = gimple_assign_rhs2 (def_stmt);
3101     }
3102   else
3103     {
3104       if (dump_enabled_p ())
3105         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3106                         "reduction: not handled operation: ");
3107       return NULL;
3108     }
3109
3110   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3111     {
3112       if (dump_enabled_p ())
3113         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3114                         "reduction: both uses not ssa_names: ");
3115
3116       return NULL;
3117     }
3118
3119   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3120   if ((TREE_CODE (op1) == SSA_NAME
3121        && !types_compatible_p (type,TREE_TYPE (op1)))
3122       || (TREE_CODE (op2) == SSA_NAME
3123           && !types_compatible_p (type, TREE_TYPE (op2)))
3124       || (op3 && TREE_CODE (op3) == SSA_NAME
3125           && !types_compatible_p (type, TREE_TYPE (op3)))
3126       || (op4 && TREE_CODE (op4) == SSA_NAME
3127           && !types_compatible_p (type, TREE_TYPE (op4))))
3128     {
3129       if (dump_enabled_p ())
3130         {
3131           dump_printf_loc (MSG_NOTE, vect_location,
3132                            "reduction: multiple types: operation type: ");
3133           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3134           dump_printf (MSG_NOTE, ", operands types: ");
3135           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3136                              TREE_TYPE (op1));
3137           dump_printf (MSG_NOTE, ",");
3138           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3139                              TREE_TYPE (op2));
3140           if (op3)
3141             {
3142               dump_printf (MSG_NOTE, ",");
3143               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3144                                  TREE_TYPE (op3));
3145             }
3146
3147           if (op4)
3148             {
3149               dump_printf (MSG_NOTE, ",");
3150               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3151                                  TREE_TYPE (op4));
3152             }
3153           dump_printf (MSG_NOTE, "\n");
3154         }
3155
3156       return NULL;
3157     }
3158
3159   /* Check whether it's ok to change the order of the computation.
3160      Generally, when vectorizing a reduction we change the order of the
3161      computation.  This may change the behavior of the program in some
3162      cases, so we need to check that this is ok.  One exception is when
3163      vectorizing an outer-loop: the inner-loop is executed sequentially,
3164      and therefore vectorizing reductions in the inner-loop during
3165      outer-loop vectorization is safe.  */
3166   if (check_reduction
3167       && *v_reduc_type == TREE_CODE_REDUCTION
3168       && needs_fold_left_reduction_p (type, code,
3169                                       need_wrapping_integral_overflow))
3170     *v_reduc_type = FOLD_LEFT_REDUCTION;
3171
3172   /* Reduction is safe. We're dealing with one of the following:
3173      1) integer arithmetic and no trapv
3174      2) floating point arithmetic, and special flags permit this optimization
3175      3) nested cycle (i.e., outer loop vectorization).  */
3176   if (TREE_CODE (op1) == SSA_NAME)
3177     def1 = SSA_NAME_DEF_STMT (op1);
3178
3179   if (TREE_CODE (op2) == SSA_NAME)
3180     def2 = SSA_NAME_DEF_STMT (op2);
3181
3182   if (code != COND_EXPR
3183       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3184     {
3185       if (dump_enabled_p ())
3186         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3187       return NULL;
3188     }
3189
3190   /* Check that one def is the reduction def, defined by PHI,
3191      the other def is either defined in the loop ("vect_internal_def"),
3192      or it's an induction (defined by a loop-header phi-node).  */
3193
3194   if (def2 && def2 == phi
3195       && (code == COND_EXPR
3196           || !def1 || gimple_nop_p (def1)
3197           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3198           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3199               && (is_gimple_assign (def1)
3200                   || is_gimple_call (def1)
3201                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3202                       == vect_induction_def
3203                   || (gimple_code (def1) == GIMPLE_PHI
3204                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3205                           == vect_internal_def
3206                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3207     {
3208       if (dump_enabled_p ())
3209         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3210       return def_stmt;
3211     }
3212
3213   if (def1 && def1 == phi
3214       && (code == COND_EXPR
3215           || !def2 || gimple_nop_p (def2)
3216           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3217           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3218               && (is_gimple_assign (def2)
3219                   || is_gimple_call (def2)
3220                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3221                        == vect_induction_def
3222                   || (gimple_code (def2) == GIMPLE_PHI
3223                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3224                            == vect_internal_def
3225                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3226     {
3227       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3228         {
3229           /* Check if we can swap operands (just for simplicity - so that
3230              the rest of the code can assume that the reduction variable
3231              is always the last (second) argument).  */
3232           if (code == COND_EXPR)
3233             {
3234               /* Swap cond_expr by inverting the condition.  */
3235               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3236               enum tree_code invert_code = ERROR_MARK;
3237               enum tree_code cond_code = TREE_CODE (cond_expr);
3238
3239               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3240                 {
3241                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3242                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3243                 }
3244               if (invert_code != ERROR_MARK)
3245                 {
3246                   TREE_SET_CODE (cond_expr, invert_code);
3247                   swap_ssa_operands (def_stmt,
3248                                      gimple_assign_rhs2_ptr (def_stmt),
3249                                      gimple_assign_rhs3_ptr (def_stmt));
3250                 }
3251               else
3252                 {
3253                   if (dump_enabled_p ())
3254                     report_vect_op (MSG_NOTE, def_stmt,
3255                                     "detected reduction: cannot swap operands "
3256                                     "for cond_expr");
3257                   return NULL;
3258                 }
3259             }
3260           else
3261             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3262                                gimple_assign_rhs2_ptr (def_stmt));
3263
3264           if (dump_enabled_p ())
3265             report_vect_op (MSG_NOTE, def_stmt,
3266                             "detected reduction: need to swap operands: ");
3267
3268           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3269             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3270         }
3271       else
3272         {
3273           if (dump_enabled_p ())
3274             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3275         }
3276
3277       return def_stmt;
3278     }
3279
3280   /* Try to find SLP reduction chain.  */
3281   if (! nested_in_vect_loop
3282       && code != COND_EXPR
3283       && orig_code != MINUS_EXPR
3284       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3285     {
3286       if (dump_enabled_p ())
3287         report_vect_op (MSG_NOTE, def_stmt,
3288                         "reduction: detected reduction chain: ");
3289
3290       return def_stmt;
3291     }
3292
3293   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3294   gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3295   while (first)
3296     {
3297       gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3298       REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3299       REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3300       first = next;
3301     }
3302
3303   /* Look for the expression computing loop_arg from loop PHI result.  */
3304   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3305                             code))
3306     return def_stmt;
3307
3308   if (dump_enabled_p ())
3309     {
3310       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3311                       "reduction: unknown pattern: ");
3312     }
3313
3314   return NULL;
3315 }
3316
3317 /* Wrapper around vect_is_simple_reduction, which will modify code
3318    in-place if it enables detection of more reductions.  Arguments
3319    as there.  */
3320
3321 gimple *
3322 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3323                              bool *double_reduc,
3324                              bool need_wrapping_integral_overflow)
3325 {
3326   enum vect_reduction_type v_reduc_type;
3327   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3328                                           need_wrapping_integral_overflow,
3329                                           &v_reduc_type);
3330   if (def)
3331     {
3332       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3333       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3334       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3335       reduc_def_info = vinfo_for_stmt (def);
3336       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3337       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3338     }
3339   return def;
3340 }
3341
3342 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3343 int
3344 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3345                              int *peel_iters_epilogue,
3346                              stmt_vector_for_cost *scalar_cost_vec,
3347                              stmt_vector_for_cost *prologue_cost_vec,
3348                              stmt_vector_for_cost *epilogue_cost_vec)
3349 {
3350   int retval = 0;
3351   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3352
3353   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3354     {
3355       *peel_iters_epilogue = assumed_vf / 2;
3356       if (dump_enabled_p ())
3357         dump_printf_loc (MSG_NOTE, vect_location,
3358                          "cost model: epilogue peel iters set to vf/2 "
3359                          "because loop iterations are unknown .\n");
3360
3361       /* If peeled iterations are known but number of scalar loop
3362          iterations are unknown, count a taken branch per peeled loop.  */
3363       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3364                                  NULL, 0, vect_prologue);
3365       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3366                                  NULL, 0, vect_epilogue);
3367     }
3368   else
3369     {
3370       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3371       peel_iters_prologue = niters < peel_iters_prologue ?
3372                             niters : peel_iters_prologue;
3373       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3374       /* If we need to peel for gaps, but no peeling is required, we have to
3375          peel VF iterations.  */
3376       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3377         *peel_iters_epilogue = assumed_vf;
3378     }
3379
3380   stmt_info_for_cost *si;
3381   int j;
3382   if (peel_iters_prologue)
3383     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3384         {
3385           stmt_vec_info stmt_info
3386             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3387           retval += record_stmt_cost (prologue_cost_vec,
3388                                       si->count * peel_iters_prologue,
3389                                       si->kind, stmt_info, si->misalign,
3390                                       vect_prologue);
3391         }
3392   if (*peel_iters_epilogue)
3393     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3394         {
3395           stmt_vec_info stmt_info
3396             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3397           retval += record_stmt_cost (epilogue_cost_vec,
3398                                       si->count * *peel_iters_epilogue,
3399                                       si->kind, stmt_info, si->misalign,
3400                                       vect_epilogue);
3401         }
3402
3403   return retval;
3404 }
3405
3406 /* Function vect_estimate_min_profitable_iters
3407
3408    Return the number of iterations required for the vector version of the
3409    loop to be profitable relative to the cost of the scalar version of the
3410    loop.
3411
3412    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3413    of iterations for vectorization.  -1 value means loop vectorization
3414    is not profitable.  This returned value may be used for dynamic
3415    profitability check.
3416
3417    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3418    for static check against estimated number of iterations.  */
3419
3420 static void
3421 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3422                                     int *ret_min_profitable_niters,
3423                                     int *ret_min_profitable_estimate)
3424 {
3425   int min_profitable_iters;
3426   int min_profitable_estimate;
3427   int peel_iters_prologue;
3428   int peel_iters_epilogue;
3429   unsigned vec_inside_cost = 0;
3430   int vec_outside_cost = 0;
3431   unsigned vec_prologue_cost = 0;
3432   unsigned vec_epilogue_cost = 0;
3433   int scalar_single_iter_cost = 0;
3434   int scalar_outside_cost = 0;
3435   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3436   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3437   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3438
3439   /* Cost model disabled.  */
3440   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3441     {
3442       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3443       *ret_min_profitable_niters = 0;
3444       *ret_min_profitable_estimate = 0;
3445       return;
3446     }
3447
3448   /* Requires loop versioning tests to handle misalignment.  */
3449   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3450     {
3451       /*  FIXME: Make cost depend on complexity of individual check.  */
3452       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3453       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3454                             vect_prologue);
3455       dump_printf (MSG_NOTE,
3456                    "cost model: Adding cost of checks for loop "
3457                    "versioning to treat misalignment.\n");
3458     }
3459
3460   /* Requires loop versioning with alias checks.  */
3461   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3462     {
3463       /*  FIXME: Make cost depend on complexity of individual check.  */
3464       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3465       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3466                             vect_prologue);
3467       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3468       if (len)
3469         /* Count LEN - 1 ANDs and LEN comparisons.  */
3470         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3471                               NULL, 0, vect_prologue);
3472       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3473       if (len)
3474         {
3475           /* Count LEN - 1 ANDs and LEN comparisons.  */
3476           unsigned int nstmts = len * 2 - 1;
3477           /* +1 for each bias that needs adding.  */
3478           for (unsigned int i = 0; i < len; ++i)
3479             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3480               nstmts += 1;
3481           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3482                                 NULL, 0, vect_prologue);
3483         }
3484       dump_printf (MSG_NOTE,
3485                    "cost model: Adding cost of checks for loop "
3486                    "versioning aliasing.\n");
3487     }
3488
3489   /* Requires loop versioning with niter checks.  */
3490   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3491     {
3492       /*  FIXME: Make cost depend on complexity of individual check.  */
3493       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3494                             vect_prologue);
3495       dump_printf (MSG_NOTE,
3496                    "cost model: Adding cost of checks for loop "
3497                    "versioning niters.\n");
3498     }
3499
3500   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3501     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3502                           vect_prologue);
3503
3504   /* Count statements in scalar loop.  Using this as scalar cost for a single
3505      iteration for now.
3506
3507      TODO: Add outer loop support.
3508
3509      TODO: Consider assigning different costs to different scalar
3510      statements.  */
3511
3512   scalar_single_iter_cost
3513     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3514
3515   /* Add additional cost for the peeled instructions in prologue and epilogue
3516      loop.  (For fully-masked loops there will be no peeling.)
3517
3518      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3519      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3520
3521      TODO: Build an expression that represents peel_iters for prologue and
3522      epilogue to be used in a run-time test.  */
3523
3524   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3525     {
3526       peel_iters_prologue = 0;
3527       peel_iters_epilogue = 0;
3528
3529       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3530         {
3531           /* We need to peel exactly one iteration.  */
3532           peel_iters_epilogue += 1;
3533           stmt_info_for_cost *si;
3534           int j;
3535           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3536                             j, si)
3537             {
3538               struct _stmt_vec_info *stmt_info
3539                 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3540               (void) add_stmt_cost (target_cost_data, si->count,
3541                                     si->kind, stmt_info, si->misalign,
3542                                     vect_epilogue);
3543             }
3544         }
3545     }
3546   else if (npeel < 0)
3547     {
3548       peel_iters_prologue = assumed_vf / 2;
3549       dump_printf (MSG_NOTE, "cost model: "
3550                    "prologue peel iters set to vf/2.\n");
3551
3552       /* If peeling for alignment is unknown, loop bound of main loop becomes
3553          unknown.  */
3554       peel_iters_epilogue = assumed_vf / 2;
3555       dump_printf (MSG_NOTE, "cost model: "
3556                    "epilogue peel iters set to vf/2 because "
3557                    "peeling for alignment is unknown.\n");
3558
3559       /* If peeled iterations are unknown, count a taken branch and a not taken
3560          branch per peeled loop. Even if scalar loop iterations are known,
3561          vector iterations are not known since peeled prologue iterations are
3562          not known. Hence guards remain the same.  */
3563       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3564                             NULL, 0, vect_prologue);
3565       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3566                             NULL, 0, vect_prologue);
3567       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3568                             NULL, 0, vect_epilogue);
3569       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3570                             NULL, 0, vect_epilogue);
3571       stmt_info_for_cost *si;
3572       int j;
3573       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3574         {
3575           struct _stmt_vec_info *stmt_info
3576             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3577           (void) add_stmt_cost (target_cost_data,
3578                                 si->count * peel_iters_prologue,
3579                                 si->kind, stmt_info, si->misalign,
3580                                 vect_prologue);
3581           (void) add_stmt_cost (target_cost_data,
3582                                 si->count * peel_iters_epilogue,
3583                                 si->kind, stmt_info, si->misalign,
3584                                 vect_epilogue);
3585         }
3586     }
3587   else
3588     {
3589       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3590       stmt_info_for_cost *si;
3591       int j;
3592       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3593
3594       prologue_cost_vec.create (2);
3595       epilogue_cost_vec.create (2);
3596       peel_iters_prologue = npeel;
3597
3598       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3599                                           &peel_iters_epilogue,
3600                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3601                                             (loop_vinfo),
3602                                           &prologue_cost_vec,
3603                                           &epilogue_cost_vec);
3604
3605       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3606         {
3607           struct _stmt_vec_info *stmt_info
3608             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3609           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3610                                 si->misalign, vect_prologue);
3611         }
3612
3613       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3614         {
3615           struct _stmt_vec_info *stmt_info
3616             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3617           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3618                                 si->misalign, vect_epilogue);
3619         }
3620
3621       prologue_cost_vec.release ();
3622       epilogue_cost_vec.release ();
3623     }
3624
3625   /* FORNOW: The scalar outside cost is incremented in one of the
3626      following ways:
3627
3628      1. The vectorizer checks for alignment and aliasing and generates
3629      a condition that allows dynamic vectorization.  A cost model
3630      check is ANDED with the versioning condition.  Hence scalar code
3631      path now has the added cost of the versioning check.
3632
3633        if (cost > th & versioning_check)
3634          jmp to vector code
3635
3636      Hence run-time scalar is incremented by not-taken branch cost.
3637
3638      2. The vectorizer then checks if a prologue is required.  If the
3639      cost model check was not done before during versioning, it has to
3640      be done before the prologue check.
3641
3642        if (cost <= th)
3643          prologue = scalar_iters
3644        if (prologue == 0)
3645          jmp to vector code
3646        else
3647          execute prologue
3648        if (prologue == num_iters)
3649          go to exit
3650
3651      Hence the run-time scalar cost is incremented by a taken branch,
3652      plus a not-taken branch, plus a taken branch cost.
3653
3654      3. The vectorizer then checks if an epilogue is required.  If the
3655      cost model check was not done before during prologue check, it
3656      has to be done with the epilogue check.
3657
3658        if (prologue == 0)
3659          jmp to vector code
3660        else
3661          execute prologue
3662        if (prologue == num_iters)
3663          go to exit
3664        vector code:
3665          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3666            jmp to epilogue
3667
3668      Hence the run-time scalar cost should be incremented by 2 taken
3669      branches.
3670
3671      TODO: The back end may reorder the BBS's differently and reverse
3672      conditions/branch directions.  Change the estimates below to
3673      something more reasonable.  */
3674
3675   /* If the number of iterations is known and we do not do versioning, we can
3676      decide whether to vectorize at compile time.  Hence the scalar version
3677      do not carry cost model guard costs.  */
3678   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3679       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3680     {
3681       /* Cost model check occurs at versioning.  */
3682       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3683         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3684       else
3685         {
3686           /* Cost model check occurs at prologue generation.  */
3687           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3688             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3689               + vect_get_stmt_cost (cond_branch_not_taken);
3690           /* Cost model check occurs at epilogue generation.  */
3691           else
3692             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3693         }
3694     }
3695
3696   /* Complete the target-specific cost calculations.  */
3697   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3698                &vec_inside_cost, &vec_epilogue_cost);
3699
3700   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3701
3702   if (dump_enabled_p ())
3703     {
3704       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3705       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3706                    vec_inside_cost);
3707       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3708                    vec_prologue_cost);
3709       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3710                    vec_epilogue_cost);
3711       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3712                    scalar_single_iter_cost);
3713       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3714                    scalar_outside_cost);
3715       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3716                    vec_outside_cost);
3717       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3718                    peel_iters_prologue);
3719       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3720                    peel_iters_epilogue);
3721     }
3722
3723   /* Calculate number of iterations required to make the vector version
3724      profitable, relative to the loop bodies only.  The following condition
3725      must hold true:
3726      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3727      where
3728      SIC = scalar iteration cost, VIC = vector iteration cost,
3729      VOC = vector outside cost, VF = vectorization factor,
3730      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3731      SOC = scalar outside cost for run time cost model check.  */
3732
3733   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3734     {
3735       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3736                               * assumed_vf
3737                               - vec_inside_cost * peel_iters_prologue
3738                               - vec_inside_cost * peel_iters_epilogue);
3739       if (min_profitable_iters <= 0)
3740         min_profitable_iters = 0;
3741       else
3742         {
3743           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3744                                    - vec_inside_cost);
3745
3746           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3747               <= (((int) vec_inside_cost * min_profitable_iters)
3748                   + (((int) vec_outside_cost - scalar_outside_cost)
3749                      * assumed_vf)))
3750             min_profitable_iters++;
3751         }
3752     }
3753   /* vector version will never be profitable.  */
3754   else
3755     {
3756       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3757         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3758                     "did not happen for a simd loop");
3759
3760       if (dump_enabled_p ())
3761         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3762                          "cost model: the vector iteration cost = %d "
3763                          "divided by the scalar iteration cost = %d "
3764                          "is greater or equal to the vectorization factor = %d"
3765                          ".\n",
3766                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3767       *ret_min_profitable_niters = -1;
3768       *ret_min_profitable_estimate = -1;
3769       return;
3770     }
3771
3772   dump_printf (MSG_NOTE,
3773                "  Calculated minimum iters for profitability: %d\n",
3774                min_profitable_iters);
3775
3776   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3777       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3778     /* We want the vectorized loop to execute at least once.  */
3779     min_profitable_iters = assumed_vf + peel_iters_prologue;
3780
3781   if (dump_enabled_p ())
3782     dump_printf_loc (MSG_NOTE, vect_location,
3783                      "  Runtime profitability threshold = %d\n",
3784                      min_profitable_iters);
3785
3786   *ret_min_profitable_niters = min_profitable_iters;
3787
3788   /* Calculate number of iterations required to make the vector version
3789      profitable, relative to the loop bodies only.
3790
3791      Non-vectorized variant is SIC * niters and it must win over vector
3792      variant on the expected loop trip count.  The following condition must hold true:
3793      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3794
3795   if (vec_outside_cost <= 0)
3796     min_profitable_estimate = 0;
3797   else
3798     {
3799       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3800                                  * assumed_vf
3801                                  - vec_inside_cost * peel_iters_prologue
3802                                  - vec_inside_cost * peel_iters_epilogue)
3803                                  / ((scalar_single_iter_cost * assumed_vf)
3804                                    - vec_inside_cost);
3805     }
3806   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3807   if (dump_enabled_p ())
3808     dump_printf_loc (MSG_NOTE, vect_location,
3809                      "  Static estimate profitability threshold = %d\n",
3810                      min_profitable_estimate);
3811
3812   *ret_min_profitable_estimate = min_profitable_estimate;
3813 }
3814
3815 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3816    vector elements (not bits) for a vector with NELT elements.  */
3817 static void
3818 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3819                               vec_perm_builder *sel)
3820 {
3821   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3822      by vec_perm_indices.  */
3823   sel->new_vector (nelt, 1, 3);
3824   for (unsigned int i = 0; i < 3; i++)
3825     sel->quick_push (i + offset);
3826 }
3827
3828 /* Checks whether the target supports whole-vector shifts for vectors of mode
3829    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3830    it supports vec_perm_const with masks for all necessary shift amounts.  */
3831 static bool
3832 have_whole_vector_shift (machine_mode mode)
3833 {
3834   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3835     return true;
3836
3837   /* Variable-length vectors should be handled via the optab.  */
3838   unsigned int nelt;
3839   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3840     return false;
3841
3842   vec_perm_builder sel;
3843   vec_perm_indices indices;
3844   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3845     {
3846       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3847       indices.new_vector (sel, 2, nelt);
3848       if (!can_vec_perm_const_p (mode, indices, false))
3849         return false;
3850     }
3851   return true;
3852 }
3853
3854 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3855    functions. Design better to avoid maintenance issues.  */
3856
3857 /* Function vect_model_reduction_cost.
3858
3859    Models cost for a reduction operation, including the vector ops
3860    generated within the strip-mine loop, the initial definition before
3861    the loop, and the epilogue code that must be generated.  */
3862
3863 static void
3864 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3865                            int ncopies, stmt_vector_for_cost *cost_vec)
3866 {
3867   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3868   enum tree_code code;
3869   optab optab;
3870   tree vectype;
3871   gimple *orig_stmt;
3872   machine_mode mode;
3873   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3874   struct loop *loop = NULL;
3875
3876   if (loop_vinfo)
3877     loop = LOOP_VINFO_LOOP (loop_vinfo);
3878
3879   /* Condition reductions generate two reductions in the loop.  */
3880   vect_reduction_type reduction_type
3881     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3882   if (reduction_type == COND_REDUCTION)
3883     ncopies *= 2;
3884
3885   vectype = STMT_VINFO_VECTYPE (stmt_info);
3886   mode = TYPE_MODE (vectype);
3887   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3888
3889   if (!orig_stmt)
3890     orig_stmt = STMT_VINFO_STMT (stmt_info);
3891
3892   code = gimple_assign_rhs_code (orig_stmt);
3893
3894   if (reduction_type == EXTRACT_LAST_REDUCTION
3895       || reduction_type == FOLD_LEFT_REDUCTION)
3896     {
3897       /* No extra instructions needed in the prologue.  */
3898       prologue_cost = 0;
3899
3900       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3901         /* Count one reduction-like operation per vector.  */
3902         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3903                                         stmt_info, 0, vect_body);
3904       else
3905         {
3906           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
3907           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3908           inside_cost = record_stmt_cost (cost_vec, nelements,
3909                                           vec_to_scalar, stmt_info, 0,
3910                                           vect_body);
3911           inside_cost += record_stmt_cost (cost_vec, nelements,
3912                                            scalar_stmt, stmt_info, 0,
3913                                            vect_body);
3914         }
3915     }
3916   else
3917     {
3918       /* Add in cost for initial definition.
3919          For cond reduction we have four vectors: initial index, step,
3920          initial result of the data reduction, initial value of the index
3921          reduction.  */
3922       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3923       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3924                                          scalar_to_vec, stmt_info, 0,
3925                                          vect_prologue);
3926
3927       /* Cost of reduction op inside loop.  */
3928       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3929                                       stmt_info, 0, vect_body);
3930     }
3931
3932   /* Determine cost of epilogue code.
3933
3934      We have a reduction operator that will reduce the vector in one statement.
3935      Also requires scalar extract.  */
3936
3937   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3938     {
3939       if (reduc_fn != IFN_LAST)
3940         {
3941           if (reduction_type == COND_REDUCTION)
3942             {
3943               /* An EQ stmt and an COND_EXPR stmt.  */
3944               epilogue_cost += record_stmt_cost (cost_vec, 2,
3945                                                  vector_stmt, stmt_info, 0,
3946                                                  vect_epilogue);
3947               /* Reduction of the max index and a reduction of the found
3948                  values.  */
3949               epilogue_cost += record_stmt_cost (cost_vec, 2,
3950                                                  vec_to_scalar, stmt_info, 0,
3951                                                  vect_epilogue);
3952               /* A broadcast of the max value.  */
3953               epilogue_cost += record_stmt_cost (cost_vec, 1,
3954                                                  scalar_to_vec, stmt_info, 0,
3955                                                  vect_epilogue);
3956             }
3957           else
3958             {
3959               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3960                                                  stmt_info, 0, vect_epilogue);
3961               epilogue_cost += record_stmt_cost (cost_vec, 1,
3962                                                  vec_to_scalar, stmt_info, 0,
3963                                                  vect_epilogue);
3964             }
3965         }
3966       else if (reduction_type == COND_REDUCTION)
3967         {
3968           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3969           /* Extraction of scalar elements.  */
3970           epilogue_cost += record_stmt_cost (cost_vec,
3971                                              2 * estimated_nunits,
3972                                              vec_to_scalar, stmt_info, 0,
3973                                              vect_epilogue);
3974           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3975           epilogue_cost += record_stmt_cost (cost_vec,
3976                                              2 * estimated_nunits - 3,
3977                                              scalar_stmt, stmt_info, 0,
3978                                              vect_epilogue);
3979         }
3980       else if (reduction_type == EXTRACT_LAST_REDUCTION
3981                || reduction_type == FOLD_LEFT_REDUCTION)
3982         /* No extra instructions need in the epilogue.  */
3983         ;
3984       else
3985         {
3986           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3987           tree bitsize =
3988             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3989           int element_bitsize = tree_to_uhwi (bitsize);
3990           int nelements = vec_size_in_bits / element_bitsize;
3991
3992           if (code == COND_EXPR)
3993             code = MAX_EXPR;
3994
3995           optab = optab_for_tree_code (code, vectype, optab_default);
3996
3997           /* We have a whole vector shift available.  */
3998           if (optab != unknown_optab
3999               && VECTOR_MODE_P (mode)
4000               && optab_handler (optab, mode) != CODE_FOR_nothing
4001               && have_whole_vector_shift (mode))
4002             {
4003               /* Final reduction via vector shifts and the reduction operator.
4004                  Also requires scalar extract.  */
4005               epilogue_cost += record_stmt_cost (cost_vec,
4006                                                  exact_log2 (nelements) * 2,
4007                                                  vector_stmt, stmt_info, 0,
4008                                                  vect_epilogue);
4009               epilogue_cost += record_stmt_cost (cost_vec, 1,
4010                                                  vec_to_scalar, stmt_info, 0,
4011                                                  vect_epilogue);
4012             }
4013           else
4014             /* Use extracts and reduction op for final reduction.  For N
4015                elements, we have N extracts and N-1 reduction ops.  */
4016             epilogue_cost += record_stmt_cost (cost_vec,
4017                                                nelements + nelements - 1,
4018                                                vector_stmt, stmt_info, 0,
4019                                                vect_epilogue);
4020         }
4021     }
4022
4023   if (dump_enabled_p ())
4024     dump_printf (MSG_NOTE,
4025                  "vect_model_reduction_cost: inside_cost = %d, "
4026                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4027                  prologue_cost, epilogue_cost);
4028 }
4029
4030
4031 /* Function vect_model_induction_cost.
4032
4033    Models cost for induction operations.  */
4034
4035 static void
4036 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4037                            stmt_vector_for_cost *cost_vec)
4038 {
4039   unsigned inside_cost, prologue_cost;
4040
4041   if (PURE_SLP_STMT (stmt_info))
4042     return;
4043
4044   /* loop cost for vec_loop.  */
4045   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4046                                   stmt_info, 0, vect_body);
4047
4048   /* prologue cost for vec_init and vec_step.  */
4049   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4050                                     stmt_info, 0, vect_prologue);
4051
4052   if (dump_enabled_p ())
4053     dump_printf_loc (MSG_NOTE, vect_location,
4054                      "vect_model_induction_cost: inside_cost = %d, "
4055                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4056 }
4057
4058
4059
4060 /* Function get_initial_def_for_reduction
4061
4062    Input:
4063    STMT - a stmt that performs a reduction operation in the loop.
4064    INIT_VAL - the initial value of the reduction variable
4065
4066    Output:
4067    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4068         of the reduction (used for adjusting the epilog - see below).
4069    Return a vector variable, initialized according to the operation that STMT
4070         performs. This vector will be used as the initial value of the
4071         vector of partial results.
4072
4073    Option1 (adjust in epilog): Initialize the vector as follows:
4074      add/bit or/xor:    [0,0,...,0,0]
4075      mult/bit and:      [1,1,...,1,1]
4076      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4077    and when necessary (e.g. add/mult case) let the caller know
4078    that it needs to adjust the result by init_val.
4079
4080    Option2: Initialize the vector as follows:
4081      add/bit or/xor:    [init_val,0,0,...,0]
4082      mult/bit and:      [init_val,1,1,...,1]
4083      min/max/cond_expr: [init_val,init_val,...,init_val]
4084    and no adjustments are needed.
4085
4086    For example, for the following code:
4087
4088    s = init_val;
4089    for (i=0;i<n;i++)
4090      s = s + a[i];
4091
4092    STMT is 's = s + a[i]', and the reduction variable is 's'.
4093    For a vector of 4 units, we want to return either [0,0,0,init_val],
4094    or [0,0,0,0] and let the caller know that it needs to adjust
4095    the result at the end by 'init_val'.
4096
4097    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4098    initialization vector is simpler (same element in all entries), if
4099    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4100
4101    A cost model should help decide between these two schemes.  */
4102
4103 tree
4104 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4105                                tree *adjustment_def)
4106 {
4107   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4108   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4109   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4110   tree scalar_type = TREE_TYPE (init_val);
4111   tree vectype = get_vectype_for_scalar_type (scalar_type);
4112   enum tree_code code = gimple_assign_rhs_code (stmt);
4113   tree def_for_init;
4114   tree init_def;
4115   bool nested_in_vect_loop = false;
4116   REAL_VALUE_TYPE real_init_val = dconst0;
4117   int int_init_val = 0;
4118   gimple *def_stmt = NULL;
4119   gimple_seq stmts = NULL;
4120
4121   gcc_assert (vectype);
4122
4123   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4124               || SCALAR_FLOAT_TYPE_P (scalar_type));
4125
4126   if (nested_in_vect_loop_p (loop, stmt))
4127     nested_in_vect_loop = true;
4128   else
4129     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4130
4131   /* In case of double reduction we only create a vector variable to be put
4132      in the reduction phi node.  The actual statement creation is done in
4133      vect_create_epilog_for_reduction.  */
4134   if (adjustment_def && nested_in_vect_loop
4135       && TREE_CODE (init_val) == SSA_NAME
4136       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4137       && gimple_code (def_stmt) == GIMPLE_PHI
4138       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4139       && vinfo_for_stmt (def_stmt)
4140       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4141           == vect_double_reduction_def)
4142     {
4143       *adjustment_def = NULL;
4144       return vect_create_destination_var (init_val, vectype);
4145     }
4146
4147   vect_reduction_type reduction_type
4148     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4149
4150   /* In case of a nested reduction do not use an adjustment def as
4151      that case is not supported by the epilogue generation correctly
4152      if ncopies is not one.  */
4153   if (adjustment_def && nested_in_vect_loop)
4154     {
4155       *adjustment_def = NULL;
4156       return vect_get_vec_def_for_operand (init_val, stmt);
4157     }
4158
4159   switch (code)
4160     {
4161     case WIDEN_SUM_EXPR:
4162     case DOT_PROD_EXPR:
4163     case SAD_EXPR:
4164     case PLUS_EXPR:
4165     case MINUS_EXPR:
4166     case BIT_IOR_EXPR:
4167     case BIT_XOR_EXPR:
4168     case MULT_EXPR:
4169     case BIT_AND_EXPR:
4170       {
4171         /* ADJUSTMENT_DEF is NULL when called from
4172            vect_create_epilog_for_reduction to vectorize double reduction.  */
4173         if (adjustment_def)
4174           *adjustment_def = init_val;
4175
4176         if (code == MULT_EXPR)
4177           {
4178             real_init_val = dconst1;
4179             int_init_val = 1;
4180           }
4181
4182         if (code == BIT_AND_EXPR)
4183           int_init_val = -1;
4184
4185         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4186           def_for_init = build_real (scalar_type, real_init_val);
4187         else
4188           def_for_init = build_int_cst (scalar_type, int_init_val);
4189
4190         if (adjustment_def)
4191           /* Option1: the first element is '0' or '1' as well.  */
4192           init_def = gimple_build_vector_from_val (&stmts, vectype,
4193                                                    def_for_init);
4194         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4195           {
4196             /* Option2 (variable length): the first element is INIT_VAL.  */
4197             init_def = gimple_build_vector_from_val (&stmts, vectype,
4198                                                      def_for_init);
4199             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4200                                      vectype, init_def, init_val);
4201           }
4202         else
4203           {
4204             /* Option2: the first element is INIT_VAL.  */
4205             tree_vector_builder elts (vectype, 1, 2);
4206             elts.quick_push (init_val);
4207             elts.quick_push (def_for_init);
4208             init_def = gimple_build_vector (&stmts, &elts);
4209           }
4210       }
4211       break;
4212
4213     case MIN_EXPR:
4214     case MAX_EXPR:
4215     case COND_EXPR:
4216       {
4217         if (adjustment_def)
4218           {
4219             *adjustment_def = NULL_TREE;
4220             if (reduction_type != COND_REDUCTION
4221                 && reduction_type != EXTRACT_LAST_REDUCTION)
4222               {
4223                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4224                 break;
4225               }
4226           }
4227         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4228         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4229       }
4230       break;
4231
4232     default:
4233       gcc_unreachable ();
4234     }
4235
4236   if (stmts)
4237     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4238   return init_def;
4239 }
4240
4241 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4242    NUMBER_OF_VECTORS is the number of vector defs to create.
4243    If NEUTRAL_OP is nonnull, introducing extra elements of that
4244    value will not change the result.  */
4245
4246 static void
4247 get_initial_defs_for_reduction (slp_tree slp_node,
4248                                 vec<tree> *vec_oprnds,
4249                                 unsigned int number_of_vectors,
4250                                 bool reduc_chain, tree neutral_op)
4251 {
4252   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4253   gimple *stmt = stmts[0];
4254   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4255   unsigned HOST_WIDE_INT nunits;
4256   unsigned j, number_of_places_left_in_vector;
4257   tree vector_type;
4258   tree vop;
4259   int group_size = stmts.length ();
4260   unsigned int vec_num, i;
4261   unsigned number_of_copies = 1;
4262   vec<tree> voprnds;
4263   voprnds.create (number_of_vectors);
4264   struct loop *loop;
4265   auto_vec<tree, 16> permute_results;
4266
4267   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4268
4269   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4270
4271   loop = (gimple_bb (stmt))->loop_father;
4272   gcc_assert (loop);
4273   edge pe = loop_preheader_edge (loop);
4274
4275   gcc_assert (!reduc_chain || neutral_op);
4276
4277   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4278      created vectors. It is greater than 1 if unrolling is performed.
4279
4280      For example, we have two scalar operands, s1 and s2 (e.g., group of
4281      strided accesses of size two), while NUNITS is four (i.e., four scalars
4282      of this type can be packed in a vector).  The output vector will contain
4283      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4284      will be 2).
4285
4286      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4287      vectors containing the operands.
4288
4289      For example, NUNITS is four as before, and the group size is 8
4290      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4291      {s5, s6, s7, s8}.  */
4292
4293   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4294     nunits = group_size;
4295
4296   number_of_copies = nunits * number_of_vectors / group_size;
4297
4298   number_of_places_left_in_vector = nunits;
4299   bool constant_p = true;
4300   tree_vector_builder elts (vector_type, nunits, 1);
4301   elts.quick_grow (nunits);
4302   for (j = 0; j < number_of_copies; j++)
4303     {
4304       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4305         {
4306           tree op;
4307           /* Get the def before the loop.  In reduction chain we have only
4308              one initial value.  */
4309           if ((j != (number_of_copies - 1)
4310                || (reduc_chain && i != 0))
4311               && neutral_op)
4312             op = neutral_op;
4313           else
4314             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4315
4316           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4317           number_of_places_left_in_vector--;
4318           elts[number_of_places_left_in_vector] = op;
4319           if (!CONSTANT_CLASS_P (op))
4320             constant_p = false;
4321
4322           if (number_of_places_left_in_vector == 0)
4323             {
4324               gimple_seq ctor_seq = NULL;
4325               tree init;
4326               if (constant_p && !neutral_op
4327                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4328                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4329                 /* Build the vector directly from ELTS.  */
4330                 init = gimple_build_vector (&ctor_seq, &elts);
4331               else if (neutral_op)
4332                 {
4333                   /* Build a vector of the neutral value and shift the
4334                      other elements into place.  */
4335                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4336                                                        neutral_op);
4337                   int k = nunits;
4338                   while (k > 0 && elts[k - 1] == neutral_op)
4339                     k -= 1;
4340                   while (k > 0)
4341                     {
4342                       k -= 1;
4343                       init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4344                                            vector_type, init, elts[k]);
4345                     }
4346                 }
4347               else
4348                 {
4349                   /* First time round, duplicate ELTS to fill the
4350                      required number of vectors, then cherry pick the
4351                      appropriate result for each iteration.  */
4352                   if (vec_oprnds->is_empty ())
4353                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4354                                               number_of_vectors,
4355                                               permute_results);
4356                   init = permute_results[number_of_vectors - j - 1];
4357                 }
4358               if (ctor_seq != NULL)
4359                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4360               voprnds.quick_push (init);
4361
4362               number_of_places_left_in_vector = nunits;
4363               elts.new_vector (vector_type, nunits, 1);
4364               elts.quick_grow (nunits);
4365               constant_p = true;
4366             }
4367         }
4368     }
4369
4370   /* Since the vectors are created in the reverse order, we should invert
4371      them.  */
4372   vec_num = voprnds.length ();
4373   for (j = vec_num; j != 0; j--)
4374     {
4375       vop = voprnds[j - 1];
4376       vec_oprnds->quick_push (vop);
4377     }
4378
4379   voprnds.release ();
4380
4381   /* In case that VF is greater than the unrolling factor needed for the SLP
4382      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4383      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4384      to replicate the vectors.  */
4385   tree neutral_vec = NULL;
4386   while (number_of_vectors > vec_oprnds->length ())
4387     {
4388       if (neutral_op)
4389         {
4390           if (!neutral_vec)
4391             {
4392               gimple_seq ctor_seq = NULL;
4393               neutral_vec = gimple_build_vector_from_val
4394                 (&ctor_seq, vector_type, neutral_op);
4395               if (ctor_seq != NULL)
4396                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4397             }
4398           vec_oprnds->quick_push (neutral_vec);
4399         }
4400       else
4401         {
4402           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4403             vec_oprnds->quick_push (vop);
4404         }
4405     }
4406 }
4407
4408
4409 /* Function vect_create_epilog_for_reduction
4410
4411    Create code at the loop-epilog to finalize the result of a reduction
4412    computation.
4413
4414    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4415      reduction statements.
4416    STMT is the scalar reduction stmt that is being vectorized.
4417    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4418      number of elements that we can fit in a vectype (nunits).  In this case
4419      we have to generate more than one vector stmt - i.e - we need to "unroll"
4420      the vector stmt by a factor VF/nunits.  For more details see documentation
4421      in vectorizable_operation.
4422    REDUC_FN is the internal function for the epilog reduction.
4423    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4424      computation.
4425    REDUC_INDEX is the index of the operand in the right hand side of the
4426      statement that is defined by REDUCTION_PHI.
4427    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4428    SLP_NODE is an SLP node containing a group of reduction statements. The
4429      first one in this group is STMT.
4430    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4431      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4432      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4433      any value of the IV in the loop.
4434    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4435    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4436      null if this is not an SLP reduction
4437
4438    This function:
4439    1. Creates the reduction def-use cycles: sets the arguments for
4440       REDUCTION_PHIS:
4441       The loop-entry argument is the vectorized initial-value of the reduction.
4442       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4443       sums.
4444    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4445       by calling the function specified by REDUC_FN if available, or by
4446       other means (whole-vector shifts or a scalar loop).
4447       The function also creates a new phi node at the loop exit to preserve
4448       loop-closed form, as illustrated below.
4449
4450      The flow at the entry to this function:
4451
4452         loop:
4453           vec_def = phi <null, null>            # REDUCTION_PHI
4454           VECT_DEF = vector_stmt                # vectorized form of STMT
4455           s_loop = scalar_stmt                  # (scalar) STMT
4456         loop_exit:
4457           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4458           use <s_out0>
4459           use <s_out0>
4460
4461      The above is transformed by this function into:
4462
4463         loop:
4464           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4465           VECT_DEF = vector_stmt                # vectorized form of STMT
4466           s_loop = scalar_stmt                  # (scalar) STMT
4467         loop_exit:
4468           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4469           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4470           v_out2 = reduce <v_out1>
4471           s_out3 = extract_field <v_out2, 0>
4472           s_out4 = adjust_result <s_out3>
4473           use <s_out4>
4474           use <s_out4>
4475 */
4476
4477 static void
4478 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4479                                   gimple *reduc_def_stmt,
4480                                   int ncopies, internal_fn reduc_fn,
4481                                   vec<gimple *> reduction_phis,
4482                                   bool double_reduc,
4483                                   slp_tree slp_node,
4484                                   slp_instance slp_node_instance,
4485                                   tree induc_val, enum tree_code induc_code,
4486                                   tree neutral_op)
4487 {
4488   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4489   stmt_vec_info prev_phi_info;
4490   tree vectype;
4491   machine_mode mode;
4492   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4493   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4494   basic_block exit_bb;
4495   tree scalar_dest;
4496   tree scalar_type;
4497   gimple *new_phi = NULL, *phi;
4498   gimple_stmt_iterator exit_gsi;
4499   tree vec_dest;
4500   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4501   gimple *epilog_stmt = NULL;
4502   enum tree_code code = gimple_assign_rhs_code (stmt);
4503   gimple *exit_phi;
4504   tree bitsize;
4505   tree adjustment_def = NULL;
4506   tree vec_initial_def = NULL;
4507   tree expr, def, initial_def = NULL;
4508   tree orig_name, scalar_result;
4509   imm_use_iterator imm_iter, phi_imm_iter;
4510   use_operand_p use_p, phi_use_p;
4511   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4512   bool nested_in_vect_loop = false;
4513   auto_vec<gimple *> new_phis;
4514   auto_vec<gimple *> inner_phis;
4515   enum vect_def_type dt = vect_unknown_def_type;
4516   int j, i;
4517   auto_vec<tree> scalar_results;
4518   unsigned int group_size = 1, k, ratio;
4519   auto_vec<tree> vec_initial_defs;
4520   auto_vec<gimple *> phis;
4521   bool slp_reduc = false;
4522   bool direct_slp_reduc;
4523   tree new_phi_result;
4524   gimple *inner_phi = NULL;
4525   tree induction_index = NULL_TREE;
4526
4527   if (slp_node)
4528     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4529
4530   if (nested_in_vect_loop_p (loop, stmt))
4531     {
4532       outer_loop = loop;
4533       loop = loop->inner;
4534       nested_in_vect_loop = true;
4535       gcc_assert (!slp_node);
4536     }
4537
4538   vectype = STMT_VINFO_VECTYPE (stmt_info);
4539   gcc_assert (vectype);
4540   mode = TYPE_MODE (vectype);
4541
4542   /* 1. Create the reduction def-use cycle:
4543      Set the arguments of REDUCTION_PHIS, i.e., transform
4544
4545         loop:
4546           vec_def = phi <null, null>            # REDUCTION_PHI
4547           VECT_DEF = vector_stmt                # vectorized form of STMT
4548           ...
4549
4550      into:
4551
4552         loop:
4553           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4554           VECT_DEF = vector_stmt                # vectorized form of STMT
4555           ...
4556
4557      (in case of SLP, do it for all the phis). */
4558
4559   /* Get the loop-entry arguments.  */
4560   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4561   if (slp_node)
4562     {
4563       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4564       vec_initial_defs.reserve (vec_num);
4565       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4566                                       &vec_initial_defs, vec_num,
4567                                       REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4568                                       neutral_op);
4569     }
4570   else
4571     {
4572       /* Get at the scalar def before the loop, that defines the initial value
4573          of the reduction variable.  */
4574       gimple *def_stmt;
4575       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4576                                            loop_preheader_edge (loop));
4577       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4578          and we can't use zero for induc_val, use initial_def.  Similarly
4579          for REDUC_MIN and initial_def larger than the base.  */
4580       if (TREE_CODE (initial_def) == INTEGER_CST
4581           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4582               == INTEGER_INDUC_COND_REDUCTION)
4583           && !integer_zerop (induc_val)
4584           && ((induc_code == MAX_EXPR
4585                && tree_int_cst_lt (initial_def, induc_val))
4586               || (induc_code == MIN_EXPR
4587                   && tree_int_cst_lt (induc_val, initial_def))))
4588         induc_val = initial_def;
4589       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4590       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4591                                                        &adjustment_def);
4592       vec_initial_defs.create (1);
4593       vec_initial_defs.quick_push (vec_initial_def);
4594     }
4595
4596   /* Set phi nodes arguments.  */
4597   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4598     {
4599       tree vec_init_def = vec_initial_defs[i];
4600       tree def = vect_defs[i];
4601       for (j = 0; j < ncopies; j++)
4602         {
4603           if (j != 0)
4604             {
4605               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4606               if (nested_in_vect_loop)
4607                 vec_init_def
4608                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4609                                                     vec_init_def);
4610             }
4611
4612           /* Set the loop-entry arg of the reduction-phi.  */
4613
4614           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4615               == INTEGER_INDUC_COND_REDUCTION)
4616             {
4617               /* Initialise the reduction phi to zero.  This prevents initial
4618                  values of non-zero interferring with the reduction op.  */
4619               gcc_assert (ncopies == 1);
4620               gcc_assert (i == 0);
4621
4622               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4623               tree induc_val_vec
4624                 = build_vector_from_val (vec_init_def_type, induc_val);
4625
4626               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4627                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4628             }
4629           else
4630             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4631                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4632
4633           /* Set the loop-latch arg for the reduction-phi.  */
4634           if (j > 0)
4635             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4636
4637           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4638                        UNKNOWN_LOCATION);
4639
4640           if (dump_enabled_p ())
4641             {
4642               dump_printf_loc (MSG_NOTE, vect_location,
4643                                "transform reduction: created def-use cycle: ");
4644               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4645               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4646             }
4647         }
4648     }
4649
4650   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4651      which is updated with the current index of the loop for every match of
4652      the original loop's cond_expr (VEC_STMT).  This results in a vector
4653      containing the last time the condition passed for that vector lane.
4654      The first match will be a 1 to allow 0 to be used for non-matching
4655      indexes.  If there are no matches at all then the vector will be all
4656      zeroes.  */
4657   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4658     {
4659       tree indx_before_incr, indx_after_incr;
4660       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4661
4662       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4663       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4664
4665       int scalar_precision
4666         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4667       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4668       tree cr_index_vector_type = build_vector_type
4669         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4670
4671       /* First we create a simple vector induction variable which starts
4672          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4673          vector size (STEP).  */
4674
4675       /* Create a {1,2,3,...} vector.  */
4676       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4677
4678       /* Create a vector of the step value.  */
4679       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4680       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4681
4682       /* Create an induction variable.  */
4683       gimple_stmt_iterator incr_gsi;
4684       bool insert_after;
4685       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4686       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4687                  insert_after, &indx_before_incr, &indx_after_incr);
4688
4689       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4690          filled with zeros (VEC_ZERO).  */
4691
4692       /* Create a vector of 0s.  */
4693       tree zero = build_zero_cst (cr_index_scalar_type);
4694       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4695
4696       /* Create a vector phi node.  */
4697       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4698       new_phi = create_phi_node (new_phi_tree, loop->header);
4699       set_vinfo_for_stmt (new_phi,
4700                           new_stmt_vec_info (new_phi, loop_vinfo));
4701       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4702                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4703
4704       /* Now take the condition from the loops original cond_expr
4705          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4706          every match uses values from the induction variable
4707          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4708          (NEW_PHI_TREE).
4709          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4710          the new cond_expr (INDEX_COND_EXPR).  */
4711
4712       /* Duplicate the condition from vec_stmt.  */
4713       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4714
4715       /* Create a conditional, where the condition is taken from vec_stmt
4716          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4717          else is the phi (NEW_PHI_TREE).  */
4718       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4719                                      ccompare, indx_before_incr,
4720                                      new_phi_tree);
4721       induction_index = make_ssa_name (cr_index_vector_type);
4722       gimple *index_condition = gimple_build_assign (induction_index,
4723                                                      index_cond_expr);
4724       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4725       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4726                                                         loop_vinfo);
4727       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4728       set_vinfo_for_stmt (index_condition, index_vec_info);
4729
4730       /* Update the phi with the vec cond.  */
4731       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4732                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4733     }
4734
4735   /* 2. Create epilog code.
4736         The reduction epilog code operates across the elements of the vector
4737         of partial results computed by the vectorized loop.
4738         The reduction epilog code consists of:
4739
4740         step 1: compute the scalar result in a vector (v_out2)
4741         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4742         step 3: adjust the scalar result (s_out3) if needed.
4743
4744         Step 1 can be accomplished using one the following three schemes:
4745           (scheme 1) using reduc_fn, if available.
4746           (scheme 2) using whole-vector shifts, if available.
4747           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4748                      combined.
4749
4750           The overall epilog code looks like this:
4751
4752           s_out0 = phi <s_loop>         # original EXIT_PHI
4753           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4754           v_out2 = reduce <v_out1>              # step 1
4755           s_out3 = extract_field <v_out2, 0>    # step 2
4756           s_out4 = adjust_result <s_out3>       # step 3
4757
4758           (step 3 is optional, and steps 1 and 2 may be combined).
4759           Lastly, the uses of s_out0 are replaced by s_out4.  */
4760
4761
4762   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4763          v_out1 = phi <VECT_DEF>
4764          Store them in NEW_PHIS.  */
4765
4766   exit_bb = single_exit (loop)->dest;
4767   prev_phi_info = NULL;
4768   new_phis.create (vect_defs.length ());
4769   FOR_EACH_VEC_ELT (vect_defs, i, def)
4770     {
4771       for (j = 0; j < ncopies; j++)
4772         {
4773           tree new_def = copy_ssa_name (def);
4774           phi = create_phi_node (new_def, exit_bb);
4775           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4776           if (j == 0)
4777             new_phis.quick_push (phi);
4778           else
4779             {
4780               def = vect_get_vec_def_for_stmt_copy (dt, def);
4781               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4782             }
4783
4784           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4785           prev_phi_info = vinfo_for_stmt (phi);
4786         }
4787     }
4788
4789   /* The epilogue is created for the outer-loop, i.e., for the loop being
4790      vectorized.  Create exit phis for the outer loop.  */
4791   if (double_reduc)
4792     {
4793       loop = outer_loop;
4794       exit_bb = single_exit (loop)->dest;
4795       inner_phis.create (vect_defs.length ());
4796       FOR_EACH_VEC_ELT (new_phis, i, phi)
4797         {
4798           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4799           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4800           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4801                            PHI_RESULT (phi));
4802           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4803                                                             loop_vinfo));
4804           inner_phis.quick_push (phi);
4805           new_phis[i] = outer_phi;
4806           prev_phi_info = vinfo_for_stmt (outer_phi);
4807           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4808             {
4809               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4810               new_result = copy_ssa_name (PHI_RESULT (phi));
4811               outer_phi = create_phi_node (new_result, exit_bb);
4812               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4813                                PHI_RESULT (phi));
4814               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4815                                                                 loop_vinfo));
4816               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4817               prev_phi_info = vinfo_for_stmt (outer_phi);
4818             }
4819         }
4820     }
4821
4822   exit_gsi = gsi_after_labels (exit_bb);
4823
4824   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4825          (i.e. when reduc_fn is not available) and in the final adjustment
4826          code (if needed).  Also get the original scalar reduction variable as
4827          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4828          represents a reduction pattern), the tree-code and scalar-def are
4829          taken from the original stmt that the pattern-stmt (STMT) replaces.
4830          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4831          are taken from STMT.  */
4832
4833   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4834   if (!orig_stmt)
4835     {
4836       /* Regular reduction  */
4837       orig_stmt = stmt;
4838     }
4839   else
4840     {
4841       /* Reduction pattern  */
4842       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4843       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4844       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4845     }
4846
4847   code = gimple_assign_rhs_code (orig_stmt);
4848   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4849      partial results are added and not subtracted.  */
4850   if (code == MINUS_EXPR)
4851     code = PLUS_EXPR;
4852
4853   scalar_dest = gimple_assign_lhs (orig_stmt);
4854   scalar_type = TREE_TYPE (scalar_dest);
4855   scalar_results.create (group_size);
4856   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4857   bitsize = TYPE_SIZE (scalar_type);
4858
4859   /* In case this is a reduction in an inner-loop while vectorizing an outer
4860      loop - we don't need to extract a single scalar result at the end of the
4861      inner-loop (unless it is double reduction, i.e., the use of reduction is
4862      outside the outer-loop).  The final vector of partial results will be used
4863      in the vectorized outer-loop, or reduced to a scalar result at the end of
4864      the outer-loop.  */
4865   if (nested_in_vect_loop && !double_reduc)
4866     goto vect_finalize_reduction;
4867
4868   /* SLP reduction without reduction chain, e.g.,
4869      # a1 = phi <a2, a0>
4870      # b1 = phi <b2, b0>
4871      a2 = operation (a1)
4872      b2 = operation (b1)  */
4873   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4874
4875   /* True if we should implement SLP_REDUC using native reduction operations
4876      instead of scalar operations.  */
4877   direct_slp_reduc = (reduc_fn != IFN_LAST
4878                       && slp_reduc
4879                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4880
4881   /* In case of reduction chain, e.g.,
4882      # a1 = phi <a3, a0>
4883      a2 = operation (a1)
4884      a3 = operation (a2),
4885
4886      we may end up with more than one vector result.  Here we reduce them to
4887      one vector.  */
4888   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4889     {
4890       tree first_vect = PHI_RESULT (new_phis[0]);
4891       gassign *new_vec_stmt = NULL;
4892       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4893       for (k = 1; k < new_phis.length (); k++)
4894         {
4895           gimple *next_phi = new_phis[k];
4896           tree second_vect = PHI_RESULT (next_phi);
4897           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4898           new_vec_stmt = gimple_build_assign (tem, code,
4899                                               first_vect, second_vect);
4900           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4901           first_vect = tem;
4902         }
4903
4904       new_phi_result = first_vect;
4905       if (new_vec_stmt)
4906         {
4907           new_phis.truncate (0);
4908           new_phis.safe_push (new_vec_stmt);
4909         }
4910     }
4911   /* Likewise if we couldn't use a single defuse cycle.  */
4912   else if (ncopies > 1)
4913     {
4914       gcc_assert (new_phis.length () == 1);
4915       tree first_vect = PHI_RESULT (new_phis[0]);
4916       gassign *new_vec_stmt = NULL;
4917       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4918       gimple *next_phi = new_phis[0];
4919       for (int k = 1; k < ncopies; ++k)
4920         {
4921           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4922           tree second_vect = PHI_RESULT (next_phi);
4923           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4924           new_vec_stmt = gimple_build_assign (tem, code,
4925                                               first_vect, second_vect);
4926           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4927           first_vect = tem;
4928         }
4929       new_phi_result = first_vect;
4930       new_phis.truncate (0);
4931       new_phis.safe_push (new_vec_stmt);
4932     }
4933   else
4934     new_phi_result = PHI_RESULT (new_phis[0]);
4935
4936   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4937       && reduc_fn != IFN_LAST)
4938     {
4939       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4940          various data values where the condition matched and another vector
4941          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4942          need to extract the last matching index (which will be the index with
4943          highest value) and use this to index into the data vector.
4944          For the case where there were no matches, the data vector will contain
4945          all default values and the index vector will be all zeros.  */
4946
4947       /* Get various versions of the type of the vector of indexes.  */
4948       tree index_vec_type = TREE_TYPE (induction_index);
4949       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4950       tree index_scalar_type = TREE_TYPE (index_vec_type);
4951       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4952         (index_vec_type);
4953
4954       /* Get an unsigned integer version of the type of the data vector.  */
4955       int scalar_precision
4956         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4957       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4958       tree vectype_unsigned = build_vector_type
4959         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4960
4961       /* First we need to create a vector (ZERO_VEC) of zeros and another
4962          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4963          can create using a MAX reduction and then expanding.
4964          In the case where the loop never made any matches, the max index will
4965          be zero.  */
4966
4967       /* Vector of {0, 0, 0,...}.  */
4968       tree zero_vec = make_ssa_name (vectype);
4969       tree zero_vec_rhs = build_zero_cst (vectype);
4970       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4971       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4972
4973       /* Find maximum value from the vector of found indexes.  */
4974       tree max_index = make_ssa_name (index_scalar_type);
4975       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4976                                                           1, induction_index);
4977       gimple_call_set_lhs (max_index_stmt, max_index);
4978       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4979
4980       /* Vector of {max_index, max_index, max_index,...}.  */
4981       tree max_index_vec = make_ssa_name (index_vec_type);
4982       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4983                                                       max_index);
4984       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4985                                                         max_index_vec_rhs);
4986       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4987
4988       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4989          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4990          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4991          otherwise.  Only one value should match, resulting in a vector
4992          (VEC_COND) with one data value and the rest zeros.
4993          In the case where the loop never made any matches, every index will
4994          match, resulting in a vector with all data values (which will all be
4995          the default value).  */
4996
4997       /* Compare the max index vector to the vector of found indexes to find
4998          the position of the max value.  */
4999       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5000       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5001                                                       induction_index,
5002                                                       max_index_vec);
5003       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5004
5005       /* Use the compare to choose either values from the data vector or
5006          zero.  */
5007       tree vec_cond = make_ssa_name (vectype);
5008       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5009                                                    vec_compare, new_phi_result,
5010                                                    zero_vec);
5011       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5012
5013       /* Finally we need to extract the data value from the vector (VEC_COND)
5014          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5015          reduction, but because this doesn't exist, we can use a MAX reduction
5016          instead.  The data value might be signed or a float so we need to cast
5017          it first.
5018          In the case where the loop never made any matches, the data values are
5019          all identical, and so will reduce down correctly.  */
5020
5021       /* Make the matched data values unsigned.  */
5022       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5023       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5024                                        vec_cond);
5025       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5026                                                         VIEW_CONVERT_EXPR,
5027                                                         vec_cond_cast_rhs);
5028       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5029
5030       /* Reduce down to a scalar value.  */
5031       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5032       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5033                                                            1, vec_cond_cast);
5034       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5035       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5036
5037       /* Convert the reduced value back to the result type and set as the
5038          result.  */
5039       gimple_seq stmts = NULL;
5040       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5041                                data_reduc);
5042       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5043       scalar_results.safe_push (new_temp);
5044     }
5045   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5046            && reduc_fn == IFN_LAST)
5047     {
5048       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5049          idx = 0;
5050          idx_val = induction_index[0];
5051          val = data_reduc[0];
5052          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5053            if (induction_index[i] > idx_val)
5054              val = data_reduc[i], idx_val = induction_index[i];
5055          return val;  */
5056
5057       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5058       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5059       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5060       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5061       /* Enforced by vectorizable_reduction, which ensures we have target
5062          support before allowing a conditional reduction on variable-length
5063          vectors.  */
5064       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5065       tree idx_val = NULL_TREE, val = NULL_TREE;
5066       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5067         {
5068           tree old_idx_val = idx_val;
5069           tree old_val = val;
5070           idx_val = make_ssa_name (idx_eltype);
5071           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5072                                              build3 (BIT_FIELD_REF, idx_eltype,
5073                                                      induction_index,
5074                                                      bitsize_int (el_size),
5075                                                      bitsize_int (off)));
5076           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5077           val = make_ssa_name (data_eltype);
5078           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5079                                              build3 (BIT_FIELD_REF,
5080                                                      data_eltype,
5081                                                      new_phi_result,
5082                                                      bitsize_int (el_size),
5083                                                      bitsize_int (off)));
5084           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5085           if (off != 0)
5086             {
5087               tree new_idx_val = idx_val;
5088               tree new_val = val;
5089               if (off != v_size - el_size)
5090                 {
5091                   new_idx_val = make_ssa_name (idx_eltype);
5092                   epilog_stmt = gimple_build_assign (new_idx_val,
5093                                                      MAX_EXPR, idx_val,
5094                                                      old_idx_val);
5095                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5096                 }
5097               new_val = make_ssa_name (data_eltype);
5098               epilog_stmt = gimple_build_assign (new_val,
5099                                                  COND_EXPR,
5100                                                  build2 (GT_EXPR,
5101                                                          boolean_type_node,
5102                                                          idx_val,
5103                                                          old_idx_val),
5104                                                  val, old_val);
5105               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5106               idx_val = new_idx_val;
5107               val = new_val;
5108             }
5109         }
5110       /* Convert the reduced value back to the result type and set as the
5111          result.  */
5112       gimple_seq stmts = NULL;
5113       val = gimple_convert (&stmts, scalar_type, val);
5114       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5115       scalar_results.safe_push (val);
5116     }
5117
5118   /* 2.3 Create the reduction code, using one of the three schemes described
5119          above. In SLP we simply need to extract all the elements from the
5120          vector (without reducing them), so we use scalar shifts.  */
5121   else if (reduc_fn != IFN_LAST && !slp_reduc)
5122     {
5123       tree tmp;
5124       tree vec_elem_type;
5125
5126       /* Case 1:  Create:
5127          v_out2 = reduc_expr <v_out1>  */
5128
5129       if (dump_enabled_p ())
5130         dump_printf_loc (MSG_NOTE, vect_location,
5131                          "Reduce using direct vector reduction.\n");
5132
5133       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5134       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5135         {
5136           tree tmp_dest
5137             = vect_create_destination_var (scalar_dest, vec_elem_type);
5138           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5139                                                     new_phi_result);
5140           gimple_set_lhs (epilog_stmt, tmp_dest);
5141           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5142           gimple_set_lhs (epilog_stmt, new_temp);
5143           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5144
5145           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5146                                              new_temp);
5147         }
5148       else
5149         {
5150           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5151                                                     new_phi_result);
5152           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5153         }
5154
5155       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5156       gimple_set_lhs (epilog_stmt, new_temp);
5157       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5158
5159       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5160            == INTEGER_INDUC_COND_REDUCTION)
5161           && !operand_equal_p (initial_def, induc_val, 0))
5162         {
5163           /* Earlier we set the initial value to be a vector if induc_val
5164              values.  Check the result and if it is induc_val then replace
5165              with the original initial value, unless induc_val is
5166              the same as initial_def already.  */
5167           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5168                                   induc_val);
5169
5170           tmp = make_ssa_name (new_scalar_dest);
5171           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5172                                              initial_def, new_temp);
5173           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5174           new_temp = tmp;
5175         }
5176
5177       scalar_results.safe_push (new_temp);
5178     }
5179   else if (direct_slp_reduc)
5180     {
5181       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5182          with the elements for other SLP statements replaced with the
5183          neutral value.  We can then do a normal reduction on each vector.  */
5184
5185       /* Enforced by vectorizable_reduction.  */
5186       gcc_assert (new_phis.length () == 1);
5187       gcc_assert (pow2p_hwi (group_size));
5188
5189       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5190       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5191       gimple_seq seq = NULL;
5192
5193       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5194          and the same element size as VECTYPE.  */
5195       tree index = build_index_vector (vectype, 0, 1);
5196       tree index_type = TREE_TYPE (index);
5197       tree index_elt_type = TREE_TYPE (index_type);
5198       tree mask_type = build_same_sized_truth_vector_type (index_type);
5199
5200       /* Create a vector that, for each element, identifies which of
5201          the REDUC_GROUP_SIZE results should use it.  */
5202       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5203       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5204                             build_vector_from_val (index_type, index_mask));
5205
5206       /* Get a neutral vector value.  This is simply a splat of the neutral
5207          scalar value if we have one, otherwise the initial scalar value
5208          is itself a neutral value.  */
5209       tree vector_identity = NULL_TREE;
5210       if (neutral_op)
5211         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5212                                                         neutral_op);
5213       for (unsigned int i = 0; i < group_size; ++i)
5214         {
5215           /* If there's no univeral neutral value, we can use the
5216              initial scalar value from the original PHI.  This is used
5217              for MIN and MAX reduction, for example.  */
5218           if (!neutral_op)
5219             {
5220               tree scalar_value
5221                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5222                                          loop_preheader_edge (loop));
5223               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5224                                                               scalar_value);
5225             }
5226
5227           /* Calculate the equivalent of:
5228
5229              sel[j] = (index[j] == i);
5230
5231              which selects the elements of NEW_PHI_RESULT that should
5232              be included in the result.  */
5233           tree compare_val = build_int_cst (index_elt_type, i);
5234           compare_val = build_vector_from_val (index_type, compare_val);
5235           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5236                                    index, compare_val);
5237
5238           /* Calculate the equivalent of:
5239
5240              vec = seq ? new_phi_result : vector_identity;
5241
5242              VEC is now suitable for a full vector reduction.  */
5243           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5244                                    sel, new_phi_result, vector_identity);
5245
5246           /* Do the reduction and convert it to the appropriate type.  */
5247           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5248                                       TREE_TYPE (vectype), vec);
5249           scalar = gimple_convert (&seq, scalar_type, scalar);
5250           scalar_results.safe_push (scalar);
5251         }
5252       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5253     }
5254   else
5255     {
5256       bool reduce_with_shift;
5257       tree vec_temp;
5258
5259       /* COND reductions all do the final reduction with MAX_EXPR
5260          or MIN_EXPR.  */
5261       if (code == COND_EXPR)
5262         {
5263           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5264               == INTEGER_INDUC_COND_REDUCTION)
5265             code = induc_code;
5266           else
5267             code = MAX_EXPR;
5268         }
5269
5270       /* See if the target wants to do the final (shift) reduction
5271          in a vector mode of smaller size and first reduce upper/lower
5272          halves against each other.  */
5273       enum machine_mode mode1 = mode;
5274       tree vectype1 = vectype;
5275       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5276       unsigned sz1 = sz;
5277       if (!slp_reduc
5278           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5279         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5280
5281       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5282       reduce_with_shift = have_whole_vector_shift (mode1);
5283       if (!VECTOR_MODE_P (mode1))
5284         reduce_with_shift = false;
5285       else
5286         {
5287           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5288           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5289             reduce_with_shift = false;
5290         }
5291
5292       /* First reduce the vector to the desired vector size we should
5293          do shift reduction on by combining upper and lower halves.  */
5294       new_temp = new_phi_result;
5295       while (sz > sz1)
5296         {
5297           gcc_assert (!slp_reduc);
5298           sz /= 2;
5299           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5300
5301           /* The target has to make sure we support lowpart/highpart
5302              extraction, either via direct vector extract or through
5303              an integer mode punning.  */
5304           tree dst1, dst2;
5305           if (convert_optab_handler (vec_extract_optab,
5306                                      TYPE_MODE (TREE_TYPE (new_temp)),
5307                                      TYPE_MODE (vectype1))
5308               != CODE_FOR_nothing)
5309             {
5310               /* Extract sub-vectors directly once vec_extract becomes
5311                  a conversion optab.  */
5312               dst1 = make_ssa_name (vectype1);
5313               epilog_stmt
5314                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5315                                          build3 (BIT_FIELD_REF, vectype1,
5316                                                  new_temp, TYPE_SIZE (vectype1),
5317                                                  bitsize_int (0)));
5318               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5319               dst2 =  make_ssa_name (vectype1);
5320               epilog_stmt
5321                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5322                                          build3 (BIT_FIELD_REF, vectype1,
5323                                                  new_temp, TYPE_SIZE (vectype1),
5324                                                  bitsize_int (sz * BITS_PER_UNIT)));
5325               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5326             }
5327           else
5328             {
5329               /* Extract via punning to appropriately sized integer mode
5330                  vector.  */
5331               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5332                                                             1);
5333               tree etype = build_vector_type (eltype, 2);
5334               gcc_assert (convert_optab_handler (vec_extract_optab,
5335                                                  TYPE_MODE (etype),
5336                                                  TYPE_MODE (eltype))
5337                           != CODE_FOR_nothing);
5338               tree tem = make_ssa_name (etype);
5339               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5340                                                  build1 (VIEW_CONVERT_EXPR,
5341                                                          etype, new_temp));
5342               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5343               new_temp = tem;
5344               tem = make_ssa_name (eltype);
5345               epilog_stmt
5346                   = gimple_build_assign (tem, BIT_FIELD_REF,
5347                                          build3 (BIT_FIELD_REF, eltype,
5348                                                  new_temp, TYPE_SIZE (eltype),
5349                                                  bitsize_int (0)));
5350               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5351               dst1 = make_ssa_name (vectype1);
5352               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5353                                                  build1 (VIEW_CONVERT_EXPR,
5354                                                          vectype1, tem));
5355               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5356               tem = make_ssa_name (eltype);
5357               epilog_stmt
5358                   = gimple_build_assign (tem, BIT_FIELD_REF,
5359                                          build3 (BIT_FIELD_REF, eltype,
5360                                                  new_temp, TYPE_SIZE (eltype),
5361                                                  bitsize_int (sz * BITS_PER_UNIT)));
5362               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5363               dst2 =  make_ssa_name (vectype1);
5364               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5365                                                  build1 (VIEW_CONVERT_EXPR,
5366                                                          vectype1, tem));
5367               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5368             }
5369
5370           new_temp = make_ssa_name (vectype1);
5371           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5372           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5373         }
5374
5375       if (reduce_with_shift && !slp_reduc)
5376         {
5377           int element_bitsize = tree_to_uhwi (bitsize);
5378           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5379              for variable-length vectors and also requires direct target support
5380              for loop reductions.  */
5381           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5382           int nelements = vec_size_in_bits / element_bitsize;
5383           vec_perm_builder sel;
5384           vec_perm_indices indices;
5385
5386           int elt_offset;
5387
5388           tree zero_vec = build_zero_cst (vectype1);
5389           /* Case 2: Create:
5390              for (offset = nelements/2; offset >= 1; offset/=2)
5391                 {
5392                   Create:  va' = vec_shift <va, offset>
5393                   Create:  va = vop <va, va'>
5394                 }  */
5395
5396           tree rhs;
5397
5398           if (dump_enabled_p ())
5399             dump_printf_loc (MSG_NOTE, vect_location,
5400                              "Reduce using vector shifts\n");
5401
5402           mode1 = TYPE_MODE (vectype1);
5403           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5404           for (elt_offset = nelements / 2;
5405                elt_offset >= 1;
5406                elt_offset /= 2)
5407             {
5408               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5409               indices.new_vector (sel, 2, nelements);
5410               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5411               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5412                                                  new_temp, zero_vec, mask);
5413               new_name = make_ssa_name (vec_dest, epilog_stmt);
5414               gimple_assign_set_lhs (epilog_stmt, new_name);
5415               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5416
5417               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5418                                                  new_temp);
5419               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5420               gimple_assign_set_lhs (epilog_stmt, new_temp);
5421               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5422             }
5423
5424           /* 2.4  Extract the final scalar result.  Create:
5425              s_out3 = extract_field <v_out2, bitpos>  */
5426
5427           if (dump_enabled_p ())
5428             dump_printf_loc (MSG_NOTE, vect_location,
5429                              "extract scalar result\n");
5430
5431           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5432                         bitsize, bitsize_zero_node);
5433           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5434           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5435           gimple_assign_set_lhs (epilog_stmt, new_temp);
5436           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5437           scalar_results.safe_push (new_temp);
5438         }
5439       else
5440         {
5441           /* Case 3: Create:
5442              s = extract_field <v_out2, 0>
5443              for (offset = element_size;
5444                   offset < vector_size;
5445                   offset += element_size;)
5446                {
5447                  Create:  s' = extract_field <v_out2, offset>
5448                  Create:  s = op <s, s'>  // For non SLP cases
5449                }  */
5450
5451           if (dump_enabled_p ())
5452             dump_printf_loc (MSG_NOTE, vect_location,
5453                              "Reduce using scalar code.\n");
5454
5455           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5456           int element_bitsize = tree_to_uhwi (bitsize);
5457           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5458             {
5459               int bit_offset;
5460               if (gimple_code (new_phi) == GIMPLE_PHI)
5461                 vec_temp = PHI_RESULT (new_phi);
5462               else
5463                 vec_temp = gimple_assign_lhs (new_phi);
5464               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5465                                  bitsize_zero_node);
5466               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5467               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5468               gimple_assign_set_lhs (epilog_stmt, new_temp);
5469               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5470
5471               /* In SLP we don't need to apply reduction operation, so we just
5472                  collect s' values in SCALAR_RESULTS.  */
5473               if (slp_reduc)
5474                 scalar_results.safe_push (new_temp);
5475
5476               for (bit_offset = element_bitsize;
5477                    bit_offset < vec_size_in_bits;
5478                    bit_offset += element_bitsize)
5479                 {
5480                   tree bitpos = bitsize_int (bit_offset);
5481                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5482                                      bitsize, bitpos);
5483
5484                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5485                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5486                   gimple_assign_set_lhs (epilog_stmt, new_name);
5487                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5488
5489                   if (slp_reduc)
5490                     {
5491                       /* In SLP we don't need to apply reduction operation, so
5492                          we just collect s' values in SCALAR_RESULTS.  */
5493                       new_temp = new_name;
5494                       scalar_results.safe_push (new_name);
5495                     }
5496                   else
5497                     {
5498                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5499                                                          new_name, new_temp);
5500                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5501                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5502                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5503                     }
5504                 }
5505             }
5506
5507           /* The only case where we need to reduce scalar results in SLP, is
5508              unrolling.  If the size of SCALAR_RESULTS is greater than
5509              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5510              REDUC_GROUP_SIZE.  */
5511           if (slp_reduc)
5512             {
5513               tree res, first_res, new_res;
5514               gimple *new_stmt;
5515
5516               /* Reduce multiple scalar results in case of SLP unrolling.  */
5517               for (j = group_size; scalar_results.iterate (j, &res);
5518                    j++)
5519                 {
5520                   first_res = scalar_results[j % group_size];
5521                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5522                                                   first_res, res);
5523                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5524                   gimple_assign_set_lhs (new_stmt, new_res);
5525                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5526                   scalar_results[j % group_size] = new_res;
5527                 }
5528             }
5529           else
5530             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5531             scalar_results.safe_push (new_temp);
5532         }
5533
5534       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5535            == INTEGER_INDUC_COND_REDUCTION)
5536           && !operand_equal_p (initial_def, induc_val, 0))
5537         {
5538           /* Earlier we set the initial value to be a vector if induc_val
5539              values.  Check the result and if it is induc_val then replace
5540              with the original initial value, unless induc_val is
5541              the same as initial_def already.  */
5542           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5543                                   induc_val);
5544
5545           tree tmp = make_ssa_name (new_scalar_dest);
5546           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5547                                              initial_def, new_temp);
5548           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5549           scalar_results[0] = tmp;
5550         }
5551     }
5552
5553 vect_finalize_reduction:
5554
5555   if (double_reduc)
5556     loop = loop->inner;
5557
5558   /* 2.5 Adjust the final result by the initial value of the reduction
5559          variable. (When such adjustment is not needed, then
5560          'adjustment_def' is zero).  For example, if code is PLUS we create:
5561          new_temp = loop_exit_def + adjustment_def  */
5562
5563   if (adjustment_def)
5564     {
5565       gcc_assert (!slp_reduc);
5566       if (nested_in_vect_loop)
5567         {
5568           new_phi = new_phis[0];
5569           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5570           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5571           new_dest = vect_create_destination_var (scalar_dest, vectype);
5572         }
5573       else
5574         {
5575           new_temp = scalar_results[0];
5576           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5577           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5578           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5579         }
5580
5581       epilog_stmt = gimple_build_assign (new_dest, expr);
5582       new_temp = make_ssa_name (new_dest, epilog_stmt);
5583       gimple_assign_set_lhs (epilog_stmt, new_temp);
5584       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5585       if (nested_in_vect_loop)
5586         {
5587           set_vinfo_for_stmt (epilog_stmt,
5588                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5589           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5590                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5591
5592           if (!double_reduc)
5593             scalar_results.quick_push (new_temp);
5594           else
5595             scalar_results[0] = new_temp;
5596         }
5597       else
5598         scalar_results[0] = new_temp;
5599
5600       new_phis[0] = epilog_stmt;
5601     }
5602
5603   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5604           phis with new adjusted scalar results, i.e., replace use <s_out0>
5605           with use <s_out4>.
5606
5607      Transform:
5608         loop_exit:
5609           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5610           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5611           v_out2 = reduce <v_out1>
5612           s_out3 = extract_field <v_out2, 0>
5613           s_out4 = adjust_result <s_out3>
5614           use <s_out0>
5615           use <s_out0>
5616
5617      into:
5618
5619         loop_exit:
5620           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5621           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5622           v_out2 = reduce <v_out1>
5623           s_out3 = extract_field <v_out2, 0>
5624           s_out4 = adjust_result <s_out3>
5625           use <s_out4>
5626           use <s_out4> */
5627
5628
5629   /* In SLP reduction chain we reduce vector results into one vector if
5630      necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
5631      LHS of the last stmt in the reduction chain, since we are looking for
5632      the loop exit phi node.  */
5633   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5634     {
5635       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5636       /* Handle reduction patterns.  */
5637       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5638         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5639
5640       scalar_dest = gimple_assign_lhs (dest_stmt);
5641       group_size = 1;
5642     }
5643
5644   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5645      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5646      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5647      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5648      correspond to the first vector stmt, etc.
5649      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5650   if (group_size > new_phis.length ())
5651     {
5652       ratio = group_size / new_phis.length ();
5653       gcc_assert (!(group_size % new_phis.length ()));
5654     }
5655   else
5656     ratio = 1;
5657
5658   for (k = 0; k < group_size; k++)
5659     {
5660       if (k % ratio == 0)
5661         {
5662           epilog_stmt = new_phis[k / ratio];
5663           reduction_phi = reduction_phis[k / ratio];
5664           if (double_reduc)
5665             inner_phi = inner_phis[k / ratio];
5666         }
5667
5668       if (slp_reduc)
5669         {
5670           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5671
5672           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5673           /* SLP statements can't participate in patterns.  */
5674           gcc_assert (!orig_stmt);
5675           scalar_dest = gimple_assign_lhs (current_stmt);
5676         }
5677
5678       phis.create (3);
5679       /* Find the loop-closed-use at the loop exit of the original scalar
5680          result.  (The reduction result is expected to have two immediate uses -
5681          one at the latch block, and one at the loop exit).  */
5682       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5683         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5684             && !is_gimple_debug (USE_STMT (use_p)))
5685           phis.safe_push (USE_STMT (use_p));
5686
5687       /* While we expect to have found an exit_phi because of loop-closed-ssa
5688          form we can end up without one if the scalar cycle is dead.  */
5689
5690       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5691         {
5692           if (outer_loop)
5693             {
5694               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5695               gphi *vect_phi;
5696
5697               /* FORNOW. Currently not supporting the case that an inner-loop
5698                  reduction is not used in the outer-loop (but only outside the
5699                  outer-loop), unless it is double reduction.  */
5700               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5701                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5702                           || double_reduc);
5703
5704               if (double_reduc)
5705                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5706               else
5707                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5708               if (!double_reduc
5709                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5710                       != vect_double_reduction_def)
5711                 continue;
5712
5713               /* Handle double reduction:
5714
5715                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5716                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5717                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5718                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5719
5720                  At that point the regular reduction (stmt2 and stmt3) is
5721                  already vectorized, as well as the exit phi node, stmt4.
5722                  Here we vectorize the phi node of double reduction, stmt1, and
5723                  update all relevant statements.  */
5724
5725               /* Go through all the uses of s2 to find double reduction phi
5726                  node, i.e., stmt1 above.  */
5727               orig_name = PHI_RESULT (exit_phi);
5728               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5729                 {
5730                   stmt_vec_info use_stmt_vinfo;
5731                   stmt_vec_info new_phi_vinfo;
5732                   tree vect_phi_init, preheader_arg, vect_phi_res;
5733                   basic_block bb = gimple_bb (use_stmt);
5734                   gimple *use;
5735
5736                   /* Check that USE_STMT is really double reduction phi
5737                      node.  */
5738                   if (gimple_code (use_stmt) != GIMPLE_PHI
5739                       || gimple_phi_num_args (use_stmt) != 2
5740                       || bb->loop_father != outer_loop)
5741                     continue;
5742                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5743                   if (!use_stmt_vinfo
5744                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5745                           != vect_double_reduction_def)
5746                     continue;
5747
5748                   /* Create vector phi node for double reduction:
5749                      vs1 = phi <vs0, vs2>
5750                      vs1 was created previously in this function by a call to
5751                        vect_get_vec_def_for_operand and is stored in
5752                        vec_initial_def;
5753                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5754                      vs0 is created here.  */
5755
5756                   /* Create vector phi node.  */
5757                   vect_phi = create_phi_node (vec_initial_def, bb);
5758                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5759                                     loop_vec_info_for_loop (outer_loop));
5760                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5761
5762                   /* Create vs0 - initial def of the double reduction phi.  */
5763                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5764                                              loop_preheader_edge (outer_loop));
5765                   vect_phi_init = get_initial_def_for_reduction
5766                     (stmt, preheader_arg, NULL);
5767
5768                   /* Update phi node arguments with vs0 and vs2.  */
5769                   add_phi_arg (vect_phi, vect_phi_init,
5770                                loop_preheader_edge (outer_loop),
5771                                UNKNOWN_LOCATION);
5772                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5773                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5774                   if (dump_enabled_p ())
5775                     {
5776                       dump_printf_loc (MSG_NOTE, vect_location,
5777                                        "created double reduction phi node: ");
5778                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5779                     }
5780
5781                   vect_phi_res = PHI_RESULT (vect_phi);
5782
5783                   /* Replace the use, i.e., set the correct vs1 in the regular
5784                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5785                      loop is redundant.  */
5786                   use = reduction_phi;
5787                   for (j = 0; j < ncopies; j++)
5788                     {
5789                       edge pr_edge = loop_preheader_edge (loop);
5790                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5791                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5792                     }
5793                 }
5794             }
5795         }
5796
5797       phis.release ();
5798       if (nested_in_vect_loop)
5799         {
5800           if (double_reduc)
5801             loop = outer_loop;
5802           else
5803             continue;
5804         }
5805
5806       phis.create (3);
5807       /* Find the loop-closed-use at the loop exit of the original scalar
5808          result.  (The reduction result is expected to have two immediate uses,
5809          one at the latch block, and one at the loop exit).  For double
5810          reductions we are looking for exit phis of the outer loop.  */
5811       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5812         {
5813           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5814             {
5815               if (!is_gimple_debug (USE_STMT (use_p)))
5816                 phis.safe_push (USE_STMT (use_p));
5817             }
5818           else
5819             {
5820               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5821                 {
5822                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5823
5824                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5825                     {
5826                       if (!flow_bb_inside_loop_p (loop,
5827                                              gimple_bb (USE_STMT (phi_use_p)))
5828                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5829                         phis.safe_push (USE_STMT (phi_use_p));
5830                     }
5831                 }
5832             }
5833         }
5834
5835       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5836         {
5837           /* Replace the uses:  */
5838           orig_name = PHI_RESULT (exit_phi);
5839           scalar_result = scalar_results[k];
5840           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5841             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5842               SET_USE (use_p, scalar_result);
5843         }
5844
5845       phis.release ();
5846     }
5847 }
5848
5849 /* Return a vector of type VECTYPE that is equal to the vector select
5850    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5851    before GSI.  */
5852
5853 static tree
5854 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5855                      tree vec, tree identity)
5856 {
5857   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5858   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5859                                           mask, vec, identity);
5860   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5861   return cond;
5862 }
5863
5864 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5865    order, starting with LHS.  Insert the extraction statements before GSI and
5866    associate the new scalar SSA names with variable SCALAR_DEST.
5867    Return the SSA name for the result.  */
5868
5869 static tree
5870 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5871                        tree_code code, tree lhs, tree vector_rhs)
5872 {
5873   tree vectype = TREE_TYPE (vector_rhs);
5874   tree scalar_type = TREE_TYPE (vectype);
5875   tree bitsize = TYPE_SIZE (scalar_type);
5876   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5877   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5878
5879   for (unsigned HOST_WIDE_INT bit_offset = 0;
5880        bit_offset < vec_size_in_bits;
5881        bit_offset += element_bitsize)
5882     {
5883       tree bitpos = bitsize_int (bit_offset);
5884       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5885                          bitsize, bitpos);
5886
5887       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5888       rhs = make_ssa_name (scalar_dest, stmt);
5889       gimple_assign_set_lhs (stmt, rhs);
5890       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5891
5892       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5893       tree new_name = make_ssa_name (scalar_dest, stmt);
5894       gimple_assign_set_lhs (stmt, new_name);
5895       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5896       lhs = new_name;
5897     }
5898   return lhs;
5899 }
5900
5901 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
5902    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5903    statement.  CODE is the operation performed by STMT and OPS are
5904    its scalar operands.  REDUC_INDEX is the index of the operand in
5905    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5906    implements in-order reduction, or IFN_LAST if we should open-code it.
5907    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5908    that should be used to control the operation in a fully-masked loop.  */
5909
5910 static bool
5911 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5912                                gimple **vec_stmt, slp_tree slp_node,
5913                                gimple *reduc_def_stmt,
5914                                tree_code code, internal_fn reduc_fn,
5915                                tree ops[3], tree vectype_in,
5916                                int reduc_index, vec_loop_masks *masks)
5917 {
5918   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5919   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5920   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5921   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5922   gimple *new_stmt = NULL;
5923
5924   int ncopies;
5925   if (slp_node)
5926     ncopies = 1;
5927   else
5928     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5929
5930   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5931   gcc_assert (ncopies == 1);
5932   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5933   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5934   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5935               == FOLD_LEFT_REDUCTION);
5936
5937   if (slp_node)
5938     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5939                           TYPE_VECTOR_SUBPARTS (vectype_in)));
5940
5941   tree op0 = ops[1 - reduc_index];
5942
5943   int group_size = 1;
5944   gimple *scalar_dest_def;
5945   auto_vec<tree> vec_oprnds0;
5946   if (slp_node)
5947     {
5948       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5949       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5950       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5951     }
5952   else
5953     {
5954       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5955       vec_oprnds0.create (1);
5956       vec_oprnds0.quick_push (loop_vec_def0);
5957       scalar_dest_def = stmt;
5958     }
5959
5960   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5961   tree scalar_type = TREE_TYPE (scalar_dest);
5962   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5963
5964   int vec_num = vec_oprnds0.length ();
5965   gcc_assert (vec_num == 1 || slp_node);
5966   tree vec_elem_type = TREE_TYPE (vectype_out);
5967   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5968
5969   tree vector_identity = NULL_TREE;
5970   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5971     vector_identity = build_zero_cst (vectype_out);
5972
5973   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5974   int i;
5975   tree def0;
5976   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5977     {
5978       tree mask = NULL_TREE;
5979       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5980         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5981
5982       /* Handle MINUS by adding the negative.  */
5983       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5984         {
5985           tree negated = make_ssa_name (vectype_out);
5986           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5987           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5988           def0 = negated;
5989         }
5990
5991       if (mask)
5992         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5993                                     vector_identity);
5994
5995       /* On the first iteration the input is simply the scalar phi
5996          result, and for subsequent iterations it is the output of
5997          the preceding operation.  */
5998       if (reduc_fn != IFN_LAST)
5999         {
6000           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6001           /* For chained SLP reductions the output of the previous reduction
6002              operation serves as the input of the next. For the final statement
6003              the output cannot be a temporary - we reuse the original
6004              scalar destination of the last statement.  */
6005           if (i != vec_num - 1)
6006             {
6007               gimple_set_lhs (new_stmt, scalar_dest_var);
6008               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6009               gimple_set_lhs (new_stmt, reduc_var);
6010             }
6011         }
6012       else
6013         {
6014           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6015                                              reduc_var, def0);
6016           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6017           /* Remove the statement, so that we can use the same code paths
6018              as for statements that we've just created.  */
6019           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6020           gsi_remove (&tmp_gsi, false);
6021         }
6022
6023       if (i == vec_num - 1)
6024         {
6025           gimple_set_lhs (new_stmt, scalar_dest);
6026           vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6027         }
6028       else
6029         vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6030
6031       if (slp_node)
6032         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6033     }
6034
6035   if (!slp_node)
6036     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6037
6038   return true;
6039 }
6040
6041 /* Function is_nonwrapping_integer_induction.
6042
6043    Check if STMT (which is part of loop LOOP) both increments and
6044    does not cause overflow.  */
6045
6046 static bool
6047 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6048 {
6049   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6050   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6051   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6052   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6053   widest_int ni, max_loop_value, lhs_max;
6054   bool overflow = false;
6055
6056   /* Make sure the loop is integer based.  */
6057   if (TREE_CODE (base) != INTEGER_CST
6058       || TREE_CODE (step) != INTEGER_CST)
6059     return false;
6060
6061   /* Check that the max size of the loop will not wrap.  */
6062
6063   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6064     return true;
6065
6066   if (! max_stmt_executions (loop, &ni))
6067     return false;
6068
6069   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6070                             &overflow);
6071   if (overflow)
6072     return false;
6073
6074   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6075                             TYPE_SIGN (lhs_type), &overflow);
6076   if (overflow)
6077     return false;
6078
6079   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6080           <= TYPE_PRECISION (lhs_type));
6081 }
6082
6083 /* Function vectorizable_reduction.
6084
6085    Check if STMT performs a reduction operation that can be vectorized.
6086    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6087    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6088    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6089
6090    This function also handles reduction idioms (patterns) that have been
6091    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6092    of this form:
6093      X = pattern_expr (arg0, arg1, ..., X)
6094    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
6095    sequence that had been detected and replaced by the pattern-stmt (STMT).
6096
6097    This function also handles reduction of condition expressions, for example:
6098      for (int i = 0; i < N; i++)
6099        if (a[i] < value)
6100          last = a[i];
6101    This is handled by vectorising the loop and creating an additional vector
6102    containing the loop indexes for which "a[i] < value" was true.  In the
6103    function epilogue this is reduced to a single max value and then used to
6104    index into the vector of results.
6105
6106    In some cases of reduction patterns, the type of the reduction variable X is
6107    different than the type of the other arguments of STMT.
6108    In such cases, the vectype that is used when transforming STMT into a vector
6109    stmt is different than the vectype that is used to determine the
6110    vectorization factor, because it consists of a different number of elements
6111    than the actual number of elements that are being operated upon in parallel.
6112
6113    For example, consider an accumulation of shorts into an int accumulator.
6114    On some targets it's possible to vectorize this pattern operating on 8
6115    shorts at a time (hence, the vectype for purposes of determining the
6116    vectorization factor should be V8HI); on the other hand, the vectype that
6117    is used to create the vector form is actually V4SI (the type of the result).
6118
6119    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6120    indicates what is the actual level of parallelism (V8HI in the example), so
6121    that the right vectorization factor would be derived.  This vectype
6122    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6123    be used to create the vectorized stmt.  The right vectype for the vectorized
6124    stmt is obtained from the type of the result X:
6125         get_vectype_for_scalar_type (TREE_TYPE (X))
6126
6127    This means that, contrary to "regular" reductions (or "regular" stmts in
6128    general), the following equation:
6129       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6130    does *NOT* necessarily hold for reduction patterns.  */
6131
6132 bool
6133 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6134                         gimple **vec_stmt, slp_tree slp_node,
6135                         slp_instance slp_node_instance,
6136                         stmt_vector_for_cost *cost_vec)
6137 {
6138   tree vec_dest;
6139   tree scalar_dest;
6140   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6141   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6142   tree vectype_in = NULL_TREE;
6143   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6144   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6145   enum tree_code code, orig_code;
6146   internal_fn reduc_fn;
6147   machine_mode vec_mode;
6148   int op_type;
6149   optab optab;
6150   tree new_temp = NULL_TREE;
6151   gimple *def_stmt;
6152   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6153   gimple *cond_reduc_def_stmt = NULL;
6154   enum tree_code cond_reduc_op_code = ERROR_MARK;
6155   tree scalar_type;
6156   bool is_simple_use;
6157   gimple *orig_stmt;
6158   stmt_vec_info orig_stmt_info = NULL;
6159   int i;
6160   int ncopies;
6161   int epilog_copies;
6162   stmt_vec_info prev_stmt_info, prev_phi_info;
6163   bool single_defuse_cycle = false;
6164   gimple *new_stmt = NULL;
6165   int j;
6166   tree ops[3];
6167   enum vect_def_type dts[3];
6168   bool nested_cycle = false, found_nested_cycle_def = false;
6169   bool double_reduc = false;
6170   basic_block def_bb;
6171   struct loop * def_stmt_loop, *outer_loop = NULL;
6172   tree def_arg;
6173   gimple *def_arg_stmt;
6174   auto_vec<tree> vec_oprnds0;
6175   auto_vec<tree> vec_oprnds1;
6176   auto_vec<tree> vec_oprnds2;
6177   auto_vec<tree> vect_defs;
6178   auto_vec<gimple *> phis;
6179   int vec_num;
6180   tree def0, tem;
6181   bool first_p = true;
6182   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6183   tree cond_reduc_val = NULL_TREE;
6184
6185   /* Make sure it was already recognized as a reduction computation.  */
6186   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6187       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6188     return false;
6189
6190   if (nested_in_vect_loop_p (loop, stmt))
6191     {
6192       outer_loop = loop;
6193       loop = loop->inner;
6194       nested_cycle = true;
6195     }
6196
6197   /* In case of reduction chain we switch to the first stmt in the chain, but
6198      we don't update STMT_INFO, since only the last stmt is marked as reduction
6199      and has reduction properties.  */
6200   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6201       && REDUC_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6202     {
6203       stmt = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6204       first_p = false;
6205     }
6206
6207   if (gimple_code (stmt) == GIMPLE_PHI)
6208     {
6209       /* Analysis is fully done on the reduction stmt invocation.  */
6210       if (! vec_stmt)
6211         {
6212           if (slp_node)
6213             slp_node_instance->reduc_phis = slp_node;
6214
6215           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6216           return true;
6217         }
6218
6219       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6220         /* Leave the scalar phi in place.  Note that checking
6221            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6222            for reductions involving a single statement.  */
6223         return true;
6224
6225       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6226       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6227         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6228
6229       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6230           == EXTRACT_LAST_REDUCTION)
6231         /* Leave the scalar phi in place.  */
6232         return true;
6233
6234       gcc_assert (is_gimple_assign (reduc_stmt));
6235       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6236         {
6237           tree op = gimple_op (reduc_stmt, k);
6238           if (op == gimple_phi_result (stmt))
6239             continue;
6240           if (k == 1
6241               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6242             continue;
6243           if (!vectype_in
6244               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6245                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6246             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6247           break;
6248         }
6249       gcc_assert (vectype_in);
6250
6251       if (slp_node)
6252         ncopies = 1;
6253       else
6254         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6255
6256       use_operand_p use_p;
6257       gimple *use_stmt;
6258       if (ncopies > 1
6259           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6260               <= vect_used_only_live)
6261           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6262           && (use_stmt == reduc_stmt
6263               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6264                   == reduc_stmt)))
6265         single_defuse_cycle = true;
6266
6267       /* Create the destination vector  */
6268       scalar_dest = gimple_assign_lhs (reduc_stmt);
6269       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6270
6271       if (slp_node)
6272         /* The size vect_schedule_slp_instance computes is off for us.  */
6273         vec_num = vect_get_num_vectors
6274           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6275            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6276            vectype_in);
6277       else
6278         vec_num = 1;
6279
6280       /* Generate the reduction PHIs upfront.  */
6281       prev_phi_info = NULL;
6282       for (j = 0; j < ncopies; j++)
6283         {
6284           if (j == 0 || !single_defuse_cycle)
6285             {
6286               for (i = 0; i < vec_num; i++)
6287                 {
6288                   /* Create the reduction-phi that defines the reduction
6289                      operand.  */
6290                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6291                   set_vinfo_for_stmt (new_phi,
6292                                       new_stmt_vec_info (new_phi, loop_vinfo));
6293
6294                   if (slp_node)
6295                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6296                   else
6297                     {
6298                       if (j == 0)
6299                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6300                       else
6301                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6302                       prev_phi_info = vinfo_for_stmt (new_phi);
6303                     }
6304                 }
6305             }
6306         }
6307
6308       return true;
6309     }
6310
6311   /* 1. Is vectorizable reduction?  */
6312   /* Not supportable if the reduction variable is used in the loop, unless
6313      it's a reduction chain.  */
6314   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6315       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6316     return false;
6317
6318   /* Reductions that are not used even in an enclosing outer-loop,
6319      are expected to be "live" (used out of the loop).  */
6320   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6321       && !STMT_VINFO_LIVE_P (stmt_info))
6322     return false;
6323
6324   /* 2. Has this been recognized as a reduction pattern?
6325
6326      Check if STMT represents a pattern that has been recognized
6327      in earlier analysis stages.  For stmts that represent a pattern,
6328      the STMT_VINFO_RELATED_STMT field records the last stmt in
6329      the original sequence that constitutes the pattern.  */
6330
6331   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6332   if (orig_stmt)
6333     {
6334       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6335       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6336       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6337     }
6338
6339   /* 3. Check the operands of the operation.  The first operands are defined
6340         inside the loop body. The last operand is the reduction variable,
6341         which is defined by the loop-header-phi.  */
6342
6343   gcc_assert (is_gimple_assign (stmt));
6344
6345   /* Flatten RHS.  */
6346   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6347     {
6348     case GIMPLE_BINARY_RHS:
6349       code = gimple_assign_rhs_code (stmt);
6350       op_type = TREE_CODE_LENGTH (code);
6351       gcc_assert (op_type == binary_op);
6352       ops[0] = gimple_assign_rhs1 (stmt);
6353       ops[1] = gimple_assign_rhs2 (stmt);
6354       break;
6355
6356     case GIMPLE_TERNARY_RHS:
6357       code = gimple_assign_rhs_code (stmt);
6358       op_type = TREE_CODE_LENGTH (code);
6359       gcc_assert (op_type == ternary_op);
6360       ops[0] = gimple_assign_rhs1 (stmt);
6361       ops[1] = gimple_assign_rhs2 (stmt);
6362       ops[2] = gimple_assign_rhs3 (stmt);
6363       break;
6364
6365     case GIMPLE_UNARY_RHS:
6366       return false;
6367
6368     default:
6369       gcc_unreachable ();
6370     }
6371
6372   if (code == COND_EXPR && slp_node)
6373     return false;
6374
6375   scalar_dest = gimple_assign_lhs (stmt);
6376   scalar_type = TREE_TYPE (scalar_dest);
6377   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6378       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6379     return false;
6380
6381   /* Do not try to vectorize bit-precision reductions.  */
6382   if (!type_has_mode_precision_p (scalar_type))
6383     return false;
6384
6385   /* All uses but the last are expected to be defined in the loop.
6386      The last use is the reduction variable.  In case of nested cycle this
6387      assumption is not true: we use reduc_index to record the index of the
6388      reduction variable.  */
6389   gimple *reduc_def_stmt = NULL;
6390   int reduc_index = -1;
6391   for (i = 0; i < op_type; i++)
6392     {
6393       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6394       if (i == 0 && code == COND_EXPR)
6395         continue;
6396
6397       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6398                                           &def_stmt, &dts[i], &tem);
6399       dt = dts[i];
6400       gcc_assert (is_simple_use);
6401       if (dt == vect_reduction_def)
6402         {
6403           reduc_def_stmt = def_stmt;
6404           reduc_index = i;
6405           continue;
6406         }
6407       else if (tem)
6408         {
6409           /* To properly compute ncopies we are interested in the widest
6410              input type in case we're looking at a widening accumulation.  */
6411           if (!vectype_in
6412               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6413                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6414             vectype_in = tem;
6415         }
6416
6417       if (dt != vect_internal_def
6418           && dt != vect_external_def
6419           && dt != vect_constant_def
6420           && dt != vect_induction_def
6421           && !(dt == vect_nested_cycle && nested_cycle))
6422         return false;
6423
6424       if (dt == vect_nested_cycle)
6425         {
6426           found_nested_cycle_def = true;
6427           reduc_def_stmt = def_stmt;
6428           reduc_index = i;
6429         }
6430
6431       if (i == 1 && code == COND_EXPR)
6432         {
6433           /* Record how value of COND_EXPR is defined.  */
6434           if (dt == vect_constant_def)
6435             {
6436               cond_reduc_dt = dt;
6437               cond_reduc_val = ops[i];
6438             }
6439           if (dt == vect_induction_def
6440               && def_stmt != NULL
6441               && is_nonwrapping_integer_induction (def_stmt, loop))
6442             {
6443               cond_reduc_dt = dt;
6444               cond_reduc_def_stmt = def_stmt;
6445             }
6446         }
6447     }
6448
6449   if (!vectype_in)
6450     vectype_in = vectype_out;
6451
6452   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6453      directy used in stmt.  */
6454   if (reduc_index == -1)
6455     {
6456       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6457         {
6458           if (dump_enabled_p ())
6459             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6460                              "in-order reduction chain without SLP.\n");
6461           return false;
6462         }
6463
6464       if (orig_stmt)
6465         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6466       else
6467         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6468     }
6469
6470   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6471     return false;
6472
6473   if (!(reduc_index == -1
6474         || dts[reduc_index] == vect_reduction_def
6475         || dts[reduc_index] == vect_nested_cycle
6476         || ((dts[reduc_index] == vect_internal_def
6477              || dts[reduc_index] == vect_external_def
6478              || dts[reduc_index] == vect_constant_def
6479              || dts[reduc_index] == vect_induction_def)
6480             && nested_cycle && found_nested_cycle_def)))
6481     {
6482       /* For pattern recognized stmts, orig_stmt might be a reduction,
6483          but some helper statements for the pattern might not, or
6484          might be COND_EXPRs with reduction uses in the condition.  */
6485       gcc_assert (orig_stmt);
6486       return false;
6487     }
6488
6489   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6490   enum vect_reduction_type v_reduc_type
6491     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6492   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6493
6494   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6495   /* If we have a condition reduction, see if we can simplify it further.  */
6496   if (v_reduc_type == COND_REDUCTION)
6497     {
6498       /* TODO: We can't yet handle reduction chains, since we need to treat
6499          each COND_EXPR in the chain specially, not just the last one.
6500          E.g. for:
6501
6502             x_1 = PHI <x_3, ...>
6503             x_2 = a_2 ? ... : x_1;
6504             x_3 = a_3 ? ... : x_2;
6505
6506          we're interested in the last element in x_3 for which a_2 || a_3
6507          is true, whereas the current reduction chain handling would
6508          vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6509          as a reduction operation.  */
6510       if (reduc_index == -1)
6511         {
6512           if (dump_enabled_p ())
6513             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6514                              "conditional reduction chains not supported\n");
6515           return false;
6516         }
6517
6518       /* vect_is_simple_reduction ensured that operand 2 is the
6519          loop-carried operand.  */
6520       gcc_assert (reduc_index == 2);
6521
6522       /* Loop peeling modifies initial value of reduction PHI, which
6523          makes the reduction stmt to be transformed different to the
6524          original stmt analyzed.  We need to record reduction code for
6525          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6526          it can be used directly at transform stage.  */
6527       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6528           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6529         {
6530           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6531           gcc_assert (cond_reduc_dt == vect_constant_def);
6532           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6533         }
6534       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6535                                                vectype_in, OPTIMIZE_FOR_SPEED))
6536         {
6537           if (dump_enabled_p ())
6538             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6539                              "optimizing condition reduction with"
6540                              " FOLD_EXTRACT_LAST.\n");
6541           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6542         }
6543       else if (cond_reduc_dt == vect_induction_def)
6544         {
6545           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6546           tree base
6547             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6548           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6549
6550           gcc_assert (TREE_CODE (base) == INTEGER_CST
6551                       && TREE_CODE (step) == INTEGER_CST);
6552           cond_reduc_val = NULL_TREE;
6553           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6554              above base; punt if base is the minimum value of the type for
6555              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6556           if (tree_int_cst_sgn (step) == -1)
6557             {
6558               cond_reduc_op_code = MIN_EXPR;
6559               if (tree_int_cst_sgn (base) == -1)
6560                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6561               else if (tree_int_cst_lt (base,
6562                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6563                 cond_reduc_val
6564                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6565             }
6566           else
6567             {
6568               cond_reduc_op_code = MAX_EXPR;
6569               if (tree_int_cst_sgn (base) == 1)
6570                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6571               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6572                                         base))
6573                 cond_reduc_val
6574                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6575             }
6576           if (cond_reduc_val)
6577             {
6578               if (dump_enabled_p ())
6579                 dump_printf_loc (MSG_NOTE, vect_location,
6580                                  "condition expression based on "
6581                                  "integer induction.\n");
6582               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6583                 = INTEGER_INDUC_COND_REDUCTION;
6584             }
6585         }
6586       else if (cond_reduc_dt == vect_constant_def)
6587         {
6588           enum vect_def_type cond_initial_dt;
6589           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6590           tree cond_initial_val
6591             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6592
6593           gcc_assert (cond_reduc_val != NULL_TREE);
6594           vect_is_simple_use (cond_initial_val, loop_vinfo,
6595                               &def_stmt, &cond_initial_dt);
6596           if (cond_initial_dt == vect_constant_def
6597               && types_compatible_p (TREE_TYPE (cond_initial_val),
6598                                      TREE_TYPE (cond_reduc_val)))
6599             {
6600               tree e = fold_binary (LE_EXPR, boolean_type_node,
6601                                     cond_initial_val, cond_reduc_val);
6602               if (e && (integer_onep (e) || integer_zerop (e)))
6603                 {
6604                   if (dump_enabled_p ())
6605                     dump_printf_loc (MSG_NOTE, vect_location,
6606                                      "condition expression based on "
6607                                      "compile time constant.\n");
6608                   /* Record reduction code at analysis stage.  */
6609                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6610                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6611                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6612                     = CONST_COND_REDUCTION;
6613                 }
6614             }
6615         }
6616     }
6617
6618   if (orig_stmt)
6619     gcc_assert (tmp == orig_stmt
6620                 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6621                     == orig_stmt));
6622   else
6623     /* We changed STMT to be the first stmt in reduction chain, hence we
6624        check that in this case the first element in the chain is STMT.  */
6625     gcc_assert (stmt == tmp
6626                 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6627
6628   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6629     return false;
6630
6631   if (slp_node)
6632     ncopies = 1;
6633   else
6634     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6635
6636   gcc_assert (ncopies >= 1);
6637
6638   vec_mode = TYPE_MODE (vectype_in);
6639   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6640
6641   if (code == COND_EXPR)
6642     {
6643       /* Only call during the analysis stage, otherwise we'll lose
6644          STMT_VINFO_TYPE.  */
6645       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6646                                                 ops[reduc_index], 0, NULL,
6647                                                 cost_vec))
6648         {
6649           if (dump_enabled_p ())
6650             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6651                              "unsupported condition in reduction\n");
6652           return false;
6653         }
6654     }
6655   else
6656     {
6657       /* 4. Supportable by target?  */
6658
6659       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6660           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6661         {
6662           /* Shifts and rotates are only supported by vectorizable_shifts,
6663              not vectorizable_reduction.  */
6664           if (dump_enabled_p ())
6665             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6666                              "unsupported shift or rotation.\n");
6667           return false;
6668         }
6669
6670       /* 4.1. check support for the operation in the loop  */
6671       optab = optab_for_tree_code (code, vectype_in, optab_default);
6672       if (!optab)
6673         {
6674           if (dump_enabled_p ())
6675             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6676                              "no optab.\n");
6677
6678           return false;
6679         }
6680
6681       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6682         {
6683           if (dump_enabled_p ())
6684             dump_printf (MSG_NOTE, "op not supported by target.\n");
6685
6686           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6687               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6688             return false;
6689
6690           if (dump_enabled_p ())
6691             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6692         }
6693
6694       /* Worthwhile without SIMD support?  */
6695       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6696           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6697         {
6698           if (dump_enabled_p ())
6699             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6700                              "not worthwhile without SIMD support.\n");
6701
6702           return false;
6703         }
6704     }
6705
6706   /* 4.2. Check support for the epilog operation.
6707
6708           If STMT represents a reduction pattern, then the type of the
6709           reduction variable may be different than the type of the rest
6710           of the arguments.  For example, consider the case of accumulation
6711           of shorts into an int accumulator; The original code:
6712                         S1: int_a = (int) short_a;
6713           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6714
6715           was replaced with:
6716                         STMT: int_acc = widen_sum <short_a, int_acc>
6717
6718           This means that:
6719           1. The tree-code that is used to create the vector operation in the
6720              epilog code (that reduces the partial results) is not the
6721              tree-code of STMT, but is rather the tree-code of the original
6722              stmt from the pattern that STMT is replacing.  I.e, in the example
6723              above we want to use 'widen_sum' in the loop, but 'plus' in the
6724              epilog.
6725           2. The type (mode) we use to check available target support
6726              for the vector operation to be created in the *epilog*, is
6727              determined by the type of the reduction variable (in the example
6728              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6729              However the type (mode) we use to check available target support
6730              for the vector operation to be created *inside the loop*, is
6731              determined by the type of the other arguments to STMT (in the
6732              example we'd check this: optab_handler (widen_sum_optab,
6733              vect_short_mode)).
6734
6735           This is contrary to "regular" reductions, in which the types of all
6736           the arguments are the same as the type of the reduction variable.
6737           For "regular" reductions we can therefore use the same vector type
6738           (and also the same tree-code) when generating the epilog code and
6739           when generating the code inside the loop.  */
6740
6741   vect_reduction_type reduction_type
6742     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6743   if (orig_stmt
6744       && (reduction_type == TREE_CODE_REDUCTION
6745           || reduction_type == FOLD_LEFT_REDUCTION))
6746     {
6747       /* This is a reduction pattern: get the vectype from the type of the
6748          reduction variable, and get the tree-code from orig_stmt.  */
6749       orig_code = gimple_assign_rhs_code (orig_stmt);
6750       gcc_assert (vectype_out);
6751       vec_mode = TYPE_MODE (vectype_out);
6752     }
6753   else
6754     {
6755       /* Regular reduction: use the same vectype and tree-code as used for
6756          the vector code inside the loop can be used for the epilog code. */
6757       orig_code = code;
6758
6759       if (code == MINUS_EXPR)
6760         orig_code = PLUS_EXPR;
6761
6762       /* For simple condition reductions, replace with the actual expression
6763          we want to base our reduction around.  */
6764       if (reduction_type == CONST_COND_REDUCTION)
6765         {
6766           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6767           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6768         }
6769       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6770         orig_code = cond_reduc_op_code;
6771     }
6772
6773   if (nested_cycle)
6774     {
6775       def_bb = gimple_bb (reduc_def_stmt);
6776       def_stmt_loop = def_bb->loop_father;
6777       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6778                                        loop_preheader_edge (def_stmt_loop));
6779       if (TREE_CODE (def_arg) == SSA_NAME
6780           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6781           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6782           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6783           && vinfo_for_stmt (def_arg_stmt)
6784           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6785               == vect_double_reduction_def)
6786         double_reduc = true;
6787     }
6788
6789   reduc_fn = IFN_LAST;
6790
6791   if (reduction_type == TREE_CODE_REDUCTION
6792       || reduction_type == FOLD_LEFT_REDUCTION
6793       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6794       || reduction_type == CONST_COND_REDUCTION)
6795     {
6796       if (reduction_type == FOLD_LEFT_REDUCTION
6797           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6798           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6799         {
6800           if (reduc_fn != IFN_LAST
6801               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6802                                                   OPTIMIZE_FOR_SPEED))
6803             {
6804               if (dump_enabled_p ())
6805                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6806                                  "reduc op not supported by target.\n");
6807
6808               reduc_fn = IFN_LAST;
6809             }
6810         }
6811       else
6812         {
6813           if (!nested_cycle || double_reduc)
6814             {
6815               if (dump_enabled_p ())
6816                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6817                                  "no reduc code for scalar code.\n");
6818
6819               return false;
6820             }
6821         }
6822     }
6823   else if (reduction_type == COND_REDUCTION)
6824     {
6825       int scalar_precision
6826         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6827       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6828       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6829                                                 nunits_out);
6830
6831       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6832                                           OPTIMIZE_FOR_SPEED))
6833         reduc_fn = IFN_REDUC_MAX;
6834     }
6835
6836   if (reduction_type != EXTRACT_LAST_REDUCTION
6837       && reduc_fn == IFN_LAST
6838       && !nunits_out.is_constant ())
6839     {
6840       if (dump_enabled_p ())
6841         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6842                          "missing target support for reduction on"
6843                          " variable-length vectors.\n");
6844       return false;
6845     }
6846
6847   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6848       && ncopies > 1)
6849     {
6850       if (dump_enabled_p ())
6851         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6852                          "multiple types in double reduction or condition "
6853                          "reduction.\n");
6854       return false;
6855     }
6856
6857   /* For SLP reductions, see if there is a neutral value we can use.  */
6858   tree neutral_op = NULL_TREE;
6859   if (slp_node)
6860     neutral_op = neutral_op_for_slp_reduction
6861                    (slp_node_instance->reduc_phis, code,
6862                     REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6863
6864   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6865     {
6866       /* We can't support in-order reductions of code such as this:
6867
6868            for (int i = 0; i < n1; ++i)
6869              for (int j = 0; j < n2; ++j)
6870                l += a[j];
6871
6872          since GCC effectively transforms the loop when vectorizing:
6873
6874            for (int i = 0; i < n1 / VF; ++i)
6875              for (int j = 0; j < n2; ++j)
6876                for (int k = 0; k < VF; ++k)
6877                  l += a[j];
6878
6879          which is a reassociation of the original operation.  */
6880       if (dump_enabled_p ())
6881         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6882                          "in-order double reduction not supported.\n");
6883
6884       return false;
6885     }
6886
6887   if (reduction_type == FOLD_LEFT_REDUCTION
6888       && slp_node
6889       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6890     {
6891       /* We cannot use in-order reductions in this case because there is
6892          an implicit reassociation of the operations involved.  */
6893       if (dump_enabled_p ())
6894         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6895                          "in-order unchained SLP reductions not supported.\n");
6896       return false;
6897     }
6898
6899   /* For double reductions, and for SLP reductions with a neutral value,
6900      we construct a variable-length initial vector by loading a vector
6901      full of the neutral value and then shift-and-inserting the start
6902      values into the low-numbered elements.  */
6903   if ((double_reduc || neutral_op)
6904       && !nunits_out.is_constant ()
6905       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6906                                           vectype_out, OPTIMIZE_FOR_SPEED))
6907     {
6908       if (dump_enabled_p ())
6909         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6910                          "reduction on variable-length vectors requires"
6911                          " target support for a vector-shift-and-insert"
6912                          " operation.\n");
6913       return false;
6914     }
6915
6916   /* Check extra constraints for variable-length unchained SLP reductions.  */
6917   if (STMT_SLP_TYPE (stmt_info)
6918       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6919       && !nunits_out.is_constant ())
6920     {
6921       /* We checked above that we could build the initial vector when
6922          there's a neutral element value.  Check here for the case in
6923          which each SLP statement has its own initial value and in which
6924          that value needs to be repeated for every instance of the
6925          statement within the initial vector.  */
6926       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6927       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6928       if (!neutral_op
6929           && !can_duplicate_and_interleave_p (group_size, elt_mode))
6930         {
6931           if (dump_enabled_p ())
6932             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6933                              "unsupported form of SLP reduction for"
6934                              " variable-length vectors: cannot build"
6935                              " initial vector.\n");
6936           return false;
6937         }
6938       /* The epilogue code relies on the number of elements being a multiple
6939          of the group size.  The duplicate-and-interleave approach to setting
6940          up the the initial vector does too.  */
6941       if (!multiple_p (nunits_out, group_size))
6942         {
6943           if (dump_enabled_p ())
6944             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6945                              "unsupported form of SLP reduction for"
6946                              " variable-length vectors: the vector size"
6947                              " is not a multiple of the number of results.\n");
6948           return false;
6949         }
6950     }
6951
6952   /* In case of widenning multiplication by a constant, we update the type
6953      of the constant to be the type of the other operand.  We check that the
6954      constant fits the type in the pattern recognition pass.  */
6955   if (code == DOT_PROD_EXPR
6956       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6957     {
6958       if (TREE_CODE (ops[0]) == INTEGER_CST)
6959         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6960       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6961         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6962       else
6963         {
6964           if (dump_enabled_p ())
6965             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6966                              "invalid types in dot-prod\n");
6967
6968           return false;
6969         }
6970     }
6971
6972   if (reduction_type == COND_REDUCTION)
6973     {
6974       widest_int ni;
6975
6976       if (! max_loop_iterations (loop, &ni))
6977         {
6978           if (dump_enabled_p ())
6979             dump_printf_loc (MSG_NOTE, vect_location,
6980                              "loop count not known, cannot create cond "
6981                              "reduction.\n");
6982           return false;
6983         }
6984       /* Convert backedges to iterations.  */
6985       ni += 1;
6986
6987       /* The additional index will be the same type as the condition.  Check
6988          that the loop can fit into this less one (because we'll use up the
6989          zero slot for when there are no matches).  */
6990       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6991       if (wi::geu_p (ni, wi::to_widest (max_index)))
6992         {
6993           if (dump_enabled_p ())
6994             dump_printf_loc (MSG_NOTE, vect_location,
6995                              "loop size is greater than data size.\n");
6996           return false;
6997         }
6998     }
6999
7000   /* In case the vectorization factor (VF) is bigger than the number
7001      of elements that we can fit in a vectype (nunits), we have to generate
7002      more than one vector stmt - i.e - we need to "unroll" the
7003      vector stmt by a factor VF/nunits.  For more details see documentation
7004      in vectorizable_operation.  */
7005
7006   /* If the reduction is used in an outer loop we need to generate
7007      VF intermediate results, like so (e.g. for ncopies=2):
7008         r0 = phi (init, r0)
7009         r1 = phi (init, r1)
7010         r0 = x0 + r0;
7011         r1 = x1 + r1;
7012     (i.e. we generate VF results in 2 registers).
7013     In this case we have a separate def-use cycle for each copy, and therefore
7014     for each copy we get the vector def for the reduction variable from the
7015     respective phi node created for this copy.
7016
7017     Otherwise (the reduction is unused in the loop nest), we can combine
7018     together intermediate results, like so (e.g. for ncopies=2):
7019         r = phi (init, r)
7020         r = x0 + r;
7021         r = x1 + r;
7022    (i.e. we generate VF/2 results in a single register).
7023    In this case for each copy we get the vector def for the reduction variable
7024    from the vectorized reduction operation generated in the previous iteration.
7025
7026    This only works when we see both the reduction PHI and its only consumer
7027    in vectorizable_reduction and there are no intermediate stmts
7028    participating.  */
7029   use_operand_p use_p;
7030   gimple *use_stmt;
7031   if (ncopies > 1
7032       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7033       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7034       && (use_stmt == stmt
7035           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7036     {
7037       single_defuse_cycle = true;
7038       epilog_copies = 1;
7039     }
7040   else
7041     epilog_copies = ncopies;
7042
7043   /* If the reduction stmt is one of the patterns that have lane
7044      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7045   if ((ncopies > 1
7046        && ! single_defuse_cycle)
7047       && (code == DOT_PROD_EXPR
7048           || code == WIDEN_SUM_EXPR
7049           || code == SAD_EXPR))
7050     {
7051       if (dump_enabled_p ())
7052         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7053                          "multi def-use cycle not possible for lane-reducing "
7054                          "reduction operation\n");
7055       return false;
7056     }
7057
7058   if (slp_node)
7059     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7060   else
7061     vec_num = 1;
7062
7063   internal_fn cond_fn = get_conditional_internal_fn (code);
7064   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7065
7066   if (!vec_stmt) /* transformation not required.  */
7067     {
7068       if (first_p)
7069         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7070       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7071         {
7072           if (reduction_type != FOLD_LEFT_REDUCTION
7073               && (cond_fn == IFN_LAST
7074                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7075                                                       OPTIMIZE_FOR_SPEED)))
7076             {
7077               if (dump_enabled_p ())
7078                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7079                                  "can't use a fully-masked loop because no"
7080                                  " conditional operation is available.\n");
7081               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7082             }
7083           else if (reduc_index == -1)
7084             {
7085               if (dump_enabled_p ())
7086                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7087                                  "can't use a fully-masked loop for chained"
7088                                  " reductions.\n");
7089               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7090             }
7091           else
7092             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7093                                    vectype_in);
7094         }
7095       if (dump_enabled_p ()
7096           && reduction_type == FOLD_LEFT_REDUCTION)
7097         dump_printf_loc (MSG_NOTE, vect_location,
7098                          "using an in-order (fold-left) reduction.\n");
7099       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7100       return true;
7101     }
7102
7103   /* Transform.  */
7104
7105   if (dump_enabled_p ())
7106     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7107
7108   /* FORNOW: Multiple types are not supported for condition.  */
7109   if (code == COND_EXPR)
7110     gcc_assert (ncopies == 1);
7111
7112   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7113
7114   if (reduction_type == FOLD_LEFT_REDUCTION)
7115     return vectorize_fold_left_reduction
7116       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7117        reduc_fn, ops, vectype_in, reduc_index, masks);
7118
7119   if (reduction_type == EXTRACT_LAST_REDUCTION)
7120     {
7121       gcc_assert (!slp_node);
7122       return vectorizable_condition (stmt, gsi, vec_stmt,
7123                                      NULL, reduc_index, NULL, NULL);
7124     }
7125
7126   /* Create the destination vector  */
7127   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7128
7129   prev_stmt_info = NULL;
7130   prev_phi_info = NULL;
7131   if (!slp_node)
7132     {
7133       vec_oprnds0.create (1);
7134       vec_oprnds1.create (1);
7135       if (op_type == ternary_op)
7136         vec_oprnds2.create (1);
7137     }
7138
7139   phis.create (vec_num);
7140   vect_defs.create (vec_num);
7141   if (!slp_node)
7142     vect_defs.quick_push (NULL_TREE);
7143
7144   if (slp_node)
7145     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7146   else
7147     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7148
7149   for (j = 0; j < ncopies; j++)
7150     {
7151       if (code == COND_EXPR)
7152         {
7153           gcc_assert (!slp_node);
7154           vectorizable_condition (stmt, gsi, vec_stmt,
7155                                   PHI_RESULT (phis[0]),
7156                                   reduc_index, NULL, NULL);
7157           /* Multiple types are not supported for condition.  */
7158           break;
7159         }
7160
7161       /* Handle uses.  */
7162       if (j == 0)
7163         {
7164           if (slp_node)
7165             {
7166               /* Get vec defs for all the operands except the reduction index,
7167                  ensuring the ordering of the ops in the vector is kept.  */
7168               auto_vec<tree, 3> slp_ops;
7169               auto_vec<vec<tree>, 3> vec_defs;
7170
7171               slp_ops.quick_push (ops[0]);
7172               slp_ops.quick_push (ops[1]);
7173               if (op_type == ternary_op)
7174                 slp_ops.quick_push (ops[2]);
7175
7176               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7177
7178               vec_oprnds0.safe_splice (vec_defs[0]);
7179               vec_defs[0].release ();
7180               vec_oprnds1.safe_splice (vec_defs[1]);
7181               vec_defs[1].release ();
7182               if (op_type == ternary_op)
7183                 {
7184                   vec_oprnds2.safe_splice (vec_defs[2]);
7185                   vec_defs[2].release ();
7186                 }
7187             }
7188           else
7189             {
7190               vec_oprnds0.quick_push
7191                 (vect_get_vec_def_for_operand (ops[0], stmt));
7192               vec_oprnds1.quick_push
7193                 (vect_get_vec_def_for_operand (ops[1], stmt));
7194               if (op_type == ternary_op)
7195                 vec_oprnds2.quick_push
7196                   (vect_get_vec_def_for_operand (ops[2], stmt));
7197             }
7198         }
7199       else
7200         {
7201           if (!slp_node)
7202             {
7203               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7204
7205               if (single_defuse_cycle && reduc_index == 0)
7206                 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7207               else
7208                 vec_oprnds0[0]
7209                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7210               if (single_defuse_cycle && reduc_index == 1)
7211                 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7212               else
7213                 vec_oprnds1[0]
7214                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7215               if (op_type == ternary_op)
7216                 {
7217                   if (single_defuse_cycle && reduc_index == 2)
7218                     vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7219                   else
7220                     vec_oprnds2[0]
7221                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7222                 }
7223             }
7224         }
7225
7226       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7227         {
7228           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7229           if (masked_loop_p)
7230             {
7231               /* Make sure that the reduction accumulator is vop[0].  */
7232               if (reduc_index == 1)
7233                 {
7234                   gcc_assert (commutative_tree_code (code));
7235                   std::swap (vop[0], vop[1]);
7236                 }
7237               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7238                                               vectype_in, i * ncopies + j);
7239               gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7240                                                         vop[0], vop[1],
7241                                                         vop[0]);
7242               new_temp = make_ssa_name (vec_dest, call);
7243               gimple_call_set_lhs (call, new_temp);
7244               gimple_call_set_nothrow (call, true);
7245               new_stmt = call;
7246             }
7247           else
7248             {
7249               if (op_type == ternary_op)
7250                 vop[2] = vec_oprnds2[i];
7251
7252               new_temp = make_ssa_name (vec_dest, new_stmt);
7253               new_stmt = gimple_build_assign (new_temp, code,
7254                                               vop[0], vop[1], vop[2]);
7255             }
7256           vect_finish_stmt_generation (stmt, new_stmt, gsi);
7257
7258           if (slp_node)
7259             {
7260               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7261               vect_defs.quick_push (new_temp);
7262             }
7263           else
7264             vect_defs[0] = new_temp;
7265         }
7266
7267       if (slp_node)
7268         continue;
7269
7270       if (j == 0)
7271         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7272       else
7273         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7274
7275       prev_stmt_info = vinfo_for_stmt (new_stmt);
7276     }
7277
7278   /* Finalize the reduction-phi (set its arguments) and create the
7279      epilog reduction code.  */
7280   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7281     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7282
7283   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7284                                     epilog_copies, reduc_fn, phis,
7285                                     double_reduc, slp_node, slp_node_instance,
7286                                     cond_reduc_val, cond_reduc_op_code,
7287                                     neutral_op);
7288
7289   return true;
7290 }
7291
7292 /* Function vect_min_worthwhile_factor.
7293
7294    For a loop where we could vectorize the operation indicated by CODE,
7295    return the minimum vectorization factor that makes it worthwhile
7296    to use generic vectors.  */
7297 static unsigned int
7298 vect_min_worthwhile_factor (enum tree_code code)
7299 {
7300   switch (code)
7301     {
7302     case PLUS_EXPR:
7303     case MINUS_EXPR:
7304     case NEGATE_EXPR:
7305       return 4;
7306
7307     case BIT_AND_EXPR:
7308     case BIT_IOR_EXPR:
7309     case BIT_XOR_EXPR:
7310     case BIT_NOT_EXPR:
7311       return 2;
7312
7313     default:
7314       return INT_MAX;
7315     }
7316 }
7317
7318 /* Return true if VINFO indicates we are doing loop vectorization and if
7319    it is worth decomposing CODE operations into scalar operations for
7320    that loop's vectorization factor.  */
7321
7322 bool
7323 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7324 {
7325   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7326   unsigned HOST_WIDE_INT value;
7327   return (loop_vinfo
7328           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7329           && value >= vect_min_worthwhile_factor (code));
7330 }
7331
7332 /* Function vectorizable_induction
7333
7334    Check if PHI performs an induction computation that can be vectorized.
7335    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7336    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7337    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7338
7339 bool
7340 vectorizable_induction (gimple *phi,
7341                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7342                         gimple **vec_stmt, slp_tree slp_node,
7343                         stmt_vector_for_cost *cost_vec)
7344 {
7345   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7346   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7347   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7348   unsigned ncopies;
7349   bool nested_in_vect_loop = false;
7350   struct loop *iv_loop;
7351   tree vec_def;
7352   edge pe = loop_preheader_edge (loop);
7353   basic_block new_bb;
7354   tree new_vec, vec_init, vec_step, t;
7355   tree new_name;
7356   gimple *new_stmt;
7357   gphi *induction_phi;
7358   tree induc_def, vec_dest;
7359   tree init_expr, step_expr;
7360   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7361   unsigned i;
7362   tree expr;
7363   gimple_seq stmts;
7364   imm_use_iterator imm_iter;
7365   use_operand_p use_p;
7366   gimple *exit_phi;
7367   edge latch_e;
7368   tree loop_arg;
7369   gimple_stmt_iterator si;
7370   basic_block bb = gimple_bb (phi);
7371
7372   if (gimple_code (phi) != GIMPLE_PHI)
7373     return false;
7374
7375   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7376     return false;
7377
7378   /* Make sure it was recognized as induction computation.  */
7379   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7380     return false;
7381
7382   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7383   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7384
7385   if (slp_node)
7386     ncopies = 1;
7387   else
7388     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7389   gcc_assert (ncopies >= 1);
7390
7391   /* FORNOW. These restrictions should be relaxed.  */
7392   if (nested_in_vect_loop_p (loop, phi))
7393     {
7394       imm_use_iterator imm_iter;
7395       use_operand_p use_p;
7396       gimple *exit_phi;
7397       edge latch_e;
7398       tree loop_arg;
7399
7400       if (ncopies > 1)
7401         {
7402           if (dump_enabled_p ())
7403             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7404                              "multiple types in nested loop.\n");
7405           return false;
7406         }
7407
7408       /* FORNOW: outer loop induction with SLP not supported.  */
7409       if (STMT_SLP_TYPE (stmt_info))
7410         return false;
7411
7412       exit_phi = NULL;
7413       latch_e = loop_latch_edge (loop->inner);
7414       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7415       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7416         {
7417           gimple *use_stmt = USE_STMT (use_p);
7418           if (is_gimple_debug (use_stmt))
7419             continue;
7420
7421           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7422             {
7423               exit_phi = use_stmt;
7424               break;
7425             }
7426         }
7427       if (exit_phi)
7428         {
7429           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7430           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7431                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7432             {
7433               if (dump_enabled_p ())
7434                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7435                                  "inner-loop induction only used outside "
7436                                  "of the outer vectorized loop.\n");
7437               return false;
7438             }
7439         }
7440
7441       nested_in_vect_loop = true;
7442       iv_loop = loop->inner;
7443     }
7444   else
7445     iv_loop = loop;
7446   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7447
7448   if (slp_node && !nunits.is_constant ())
7449     {
7450       /* The current SLP code creates the initial value element-by-element.  */
7451       if (dump_enabled_p ())
7452         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7453                          "SLP induction not supported for variable-length"
7454                          " vectors.\n");
7455       return false;
7456     }
7457
7458   if (!vec_stmt) /* transformation not required.  */
7459     {
7460       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7461       if (dump_enabled_p ())
7462         dump_printf_loc (MSG_NOTE, vect_location,
7463                          "=== vectorizable_induction ===\n");
7464       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7465       return true;
7466     }
7467
7468   /* Transform.  */
7469
7470   /* Compute a vector variable, initialized with the first VF values of
7471      the induction variable.  E.g., for an iv with IV_PHI='X' and
7472      evolution S, for a vector of 4 units, we want to compute:
7473      [X, X + S, X + 2*S, X + 3*S].  */
7474
7475   if (dump_enabled_p ())
7476     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7477
7478   latch_e = loop_latch_edge (iv_loop);
7479   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7480
7481   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7482   gcc_assert (step_expr != NULL_TREE);
7483
7484   pe = loop_preheader_edge (iv_loop);
7485   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7486                                      loop_preheader_edge (iv_loop));
7487
7488   stmts = NULL;
7489   if (!nested_in_vect_loop)
7490     {
7491       /* Convert the initial value to the desired type.  */
7492       tree new_type = TREE_TYPE (vectype);
7493       init_expr = gimple_convert (&stmts, new_type, init_expr);
7494
7495       /* If we are using the loop mask to "peel" for alignment then we need
7496          to adjust the start value here.  */
7497       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7498       if (skip_niters != NULL_TREE)
7499         {
7500           if (FLOAT_TYPE_P (vectype))
7501             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7502                                         skip_niters);
7503           else
7504             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7505           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7506                                          skip_niters, step_expr);
7507           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7508                                     init_expr, skip_step);
7509         }
7510     }
7511
7512   /* Convert the step to the desired type.  */
7513   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7514
7515   if (stmts)
7516     {
7517       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7518       gcc_assert (!new_bb);
7519     }
7520
7521   /* Find the first insertion point in the BB.  */
7522   si = gsi_after_labels (bb);
7523
7524   /* For SLP induction we have to generate several IVs as for example
7525      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7526      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7527      [VF*S, VF*S, VF*S, VF*S] for all.  */
7528   if (slp_node)
7529     {
7530       /* Enforced above.  */
7531       unsigned int const_nunits = nunits.to_constant ();
7532
7533       /* Generate [VF*S, VF*S, ... ].  */
7534       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7535         {
7536           expr = build_int_cst (integer_type_node, vf);
7537           expr = fold_convert (TREE_TYPE (step_expr), expr);
7538         }
7539       else
7540         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7541       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7542                               expr, step_expr);
7543       if (! CONSTANT_CLASS_P (new_name))
7544         new_name = vect_init_vector (phi, new_name,
7545                                      TREE_TYPE (step_expr), NULL);
7546       new_vec = build_vector_from_val (vectype, new_name);
7547       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7548
7549       /* Now generate the IVs.  */
7550       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7551       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7552       unsigned elts = const_nunits * nvects;
7553       unsigned nivs = least_common_multiple (group_size,
7554                                              const_nunits) / const_nunits;
7555       gcc_assert (elts % group_size == 0);
7556       tree elt = init_expr;
7557       unsigned ivn;
7558       for (ivn = 0; ivn < nivs; ++ivn)
7559         {
7560           tree_vector_builder elts (vectype, const_nunits, 1);
7561           stmts = NULL;
7562           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7563             {
7564               if (ivn*const_nunits + eltn >= group_size
7565                   && (ivn * const_nunits + eltn) % group_size == 0)
7566                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7567                                     elt, step_expr);
7568               elts.quick_push (elt);
7569             }
7570           vec_init = gimple_build_vector (&stmts, &elts);
7571           if (stmts)
7572             {
7573               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7574               gcc_assert (!new_bb);
7575             }
7576
7577           /* Create the induction-phi that defines the induction-operand.  */
7578           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7579           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7580           set_vinfo_for_stmt (induction_phi,
7581                               new_stmt_vec_info (induction_phi, loop_vinfo));
7582           induc_def = PHI_RESULT (induction_phi);
7583
7584           /* Create the iv update inside the loop  */
7585           vec_def = make_ssa_name (vec_dest);
7586           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7587           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7588           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7589
7590           /* Set the arguments of the phi node:  */
7591           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7592           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7593                        UNKNOWN_LOCATION);
7594
7595           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7596         }
7597
7598       /* Re-use IVs when we can.  */
7599       if (ivn < nvects)
7600         {
7601           unsigned vfp
7602             = least_common_multiple (group_size, const_nunits) / group_size;
7603           /* Generate [VF'*S, VF'*S, ... ].  */
7604           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7605             {
7606               expr = build_int_cst (integer_type_node, vfp);
7607               expr = fold_convert (TREE_TYPE (step_expr), expr);
7608             }
7609           else
7610             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7611           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7612                                   expr, step_expr);
7613           if (! CONSTANT_CLASS_P (new_name))
7614             new_name = vect_init_vector (phi, new_name,
7615                                          TREE_TYPE (step_expr), NULL);
7616           new_vec = build_vector_from_val (vectype, new_name);
7617           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7618           for (; ivn < nvects; ++ivn)
7619             {
7620               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7621               tree def;
7622               if (gimple_code (iv) == GIMPLE_PHI)
7623                 def = gimple_phi_result (iv);
7624               else
7625                 def = gimple_assign_lhs (iv);
7626               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7627                                               PLUS_EXPR,
7628                                               def, vec_step);
7629               if (gimple_code (iv) == GIMPLE_PHI)
7630                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7631               else
7632                 {
7633                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7634                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7635                 }
7636               set_vinfo_for_stmt (new_stmt,
7637                                   new_stmt_vec_info (new_stmt, loop_vinfo));
7638               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7639             }
7640         }
7641
7642       return true;
7643     }
7644
7645   /* Create the vector that holds the initial_value of the induction.  */
7646   if (nested_in_vect_loop)
7647     {
7648       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7649          been created during vectorization of previous stmts.  We obtain it
7650          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7651       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7652       /* If the initial value is not of proper type, convert it.  */
7653       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7654         {
7655           new_stmt
7656             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7657                                                           vect_simple_var,
7658                                                           "vec_iv_"),
7659                                    VIEW_CONVERT_EXPR,
7660                                    build1 (VIEW_CONVERT_EXPR, vectype,
7661                                            vec_init));
7662           vec_init = gimple_assign_lhs (new_stmt);
7663           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7664                                                  new_stmt);
7665           gcc_assert (!new_bb);
7666           set_vinfo_for_stmt (new_stmt,
7667                               new_stmt_vec_info (new_stmt, loop_vinfo));
7668         }
7669     }
7670   else
7671     {
7672       /* iv_loop is the loop to be vectorized. Create:
7673          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7674       stmts = NULL;
7675       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7676
7677       unsigned HOST_WIDE_INT const_nunits;
7678       if (nunits.is_constant (&const_nunits))
7679         {
7680           tree_vector_builder elts (vectype, const_nunits, 1);
7681           elts.quick_push (new_name);
7682           for (i = 1; i < const_nunits; i++)
7683             {
7684               /* Create: new_name_i = new_name + step_expr  */
7685               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7686                                        new_name, step_expr);
7687               elts.quick_push (new_name);
7688             }
7689           /* Create a vector from [new_name_0, new_name_1, ...,
7690              new_name_nunits-1]  */
7691           vec_init = gimple_build_vector (&stmts, &elts);
7692         }
7693       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7694         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7695         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7696                                  new_name, step_expr);
7697       else
7698         {
7699           /* Build:
7700                 [base, base, base, ...]
7701                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7702           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7703           gcc_assert (flag_associative_math);
7704           tree index = build_index_vector (vectype, 0, 1);
7705           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7706                                                         new_name);
7707           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7708                                                         step_expr);
7709           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7710           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7711                                    vec_init, step_vec);
7712           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7713                                    vec_init, base_vec);
7714         }
7715
7716       if (stmts)
7717         {
7718           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7719           gcc_assert (!new_bb);
7720         }
7721     }
7722
7723
7724   /* Create the vector that holds the step of the induction.  */
7725   if (nested_in_vect_loop)
7726     /* iv_loop is nested in the loop to be vectorized. Generate:
7727        vec_step = [S, S, S, S]  */
7728     new_name = step_expr;
7729   else
7730     {
7731       /* iv_loop is the loop to be vectorized. Generate:
7732           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7733       gimple_seq seq = NULL;
7734       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7735         {
7736           expr = build_int_cst (integer_type_node, vf);
7737           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7738         }
7739       else
7740         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7741       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7742                                expr, step_expr);
7743       if (seq)
7744         {
7745           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7746           gcc_assert (!new_bb);
7747         }
7748     }
7749
7750   t = unshare_expr (new_name);
7751   gcc_assert (CONSTANT_CLASS_P (new_name)
7752               || TREE_CODE (new_name) == SSA_NAME);
7753   new_vec = build_vector_from_val (vectype, t);
7754   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7755
7756
7757   /* Create the following def-use cycle:
7758      loop prolog:
7759          vec_init = ...
7760          vec_step = ...
7761      loop:
7762          vec_iv = PHI <vec_init, vec_loop>
7763          ...
7764          STMT
7765          ...
7766          vec_loop = vec_iv + vec_step;  */
7767
7768   /* Create the induction-phi that defines the induction-operand.  */
7769   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7770   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7771   set_vinfo_for_stmt (induction_phi,
7772                       new_stmt_vec_info (induction_phi, loop_vinfo));
7773   induc_def = PHI_RESULT (induction_phi);
7774
7775   /* Create the iv update inside the loop  */
7776   vec_def = make_ssa_name (vec_dest);
7777   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7778   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7779   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7780
7781   /* Set the arguments of the phi node:  */
7782   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7783   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7784                UNKNOWN_LOCATION);
7785
7786   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7787
7788   /* In case that vectorization factor (VF) is bigger than the number
7789      of elements that we can fit in a vectype (nunits), we have to generate
7790      more than one vector stmt - i.e - we need to "unroll" the
7791      vector stmt by a factor VF/nunits.  For more details see documentation
7792      in vectorizable_operation.  */
7793
7794   if (ncopies > 1)
7795     {
7796       gimple_seq seq = NULL;
7797       stmt_vec_info prev_stmt_vinfo;
7798       /* FORNOW. This restriction should be relaxed.  */
7799       gcc_assert (!nested_in_vect_loop);
7800
7801       /* Create the vector that holds the step of the induction.  */
7802       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7803         {
7804           expr = build_int_cst (integer_type_node, nunits);
7805           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7806         }
7807       else
7808         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7809       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7810                                expr, step_expr);
7811       if (seq)
7812         {
7813           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7814           gcc_assert (!new_bb);
7815         }
7816
7817       t = unshare_expr (new_name);
7818       gcc_assert (CONSTANT_CLASS_P (new_name)
7819                   || TREE_CODE (new_name) == SSA_NAME);
7820       new_vec = build_vector_from_val (vectype, t);
7821       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7822
7823       vec_def = induc_def;
7824       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7825       for (i = 1; i < ncopies; i++)
7826         {
7827           /* vec_i = vec_prev + vec_step  */
7828           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7829                                           vec_def, vec_step);
7830           vec_def = make_ssa_name (vec_dest, new_stmt);
7831           gimple_assign_set_lhs (new_stmt, vec_def);
7832
7833           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7834           set_vinfo_for_stmt (new_stmt,
7835                               new_stmt_vec_info (new_stmt, loop_vinfo));
7836           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7837           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7838         }
7839     }
7840
7841   if (nested_in_vect_loop)
7842     {
7843       /* Find the loop-closed exit-phi of the induction, and record
7844          the final vector of induction results:  */
7845       exit_phi = NULL;
7846       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7847         {
7848           gimple *use_stmt = USE_STMT (use_p);
7849           if (is_gimple_debug (use_stmt))
7850             continue;
7851
7852           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7853             {
7854               exit_phi = use_stmt;
7855               break;
7856             }
7857         }
7858       if (exit_phi)
7859         {
7860           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7861           /* FORNOW. Currently not supporting the case that an inner-loop induction
7862              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7863           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7864                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7865
7866           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7867           if (dump_enabled_p ())
7868             {
7869               dump_printf_loc (MSG_NOTE, vect_location,
7870                                "vector of inductions after inner-loop:");
7871               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7872             }
7873         }
7874     }
7875
7876
7877   if (dump_enabled_p ())
7878     {
7879       dump_printf_loc (MSG_NOTE, vect_location,
7880                        "transform induction: created def-use cycle: ");
7881       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7882       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7883                         SSA_NAME_DEF_STMT (vec_def), 0);
7884     }
7885
7886   return true;
7887 }
7888
7889 /* Function vectorizable_live_operation.
7890
7891    STMT computes a value that is used outside the loop.  Check if
7892    it can be supported.  */
7893
7894 bool
7895 vectorizable_live_operation (gimple *stmt,
7896                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7897                              slp_tree slp_node, int slp_index,
7898                              gimple **vec_stmt,
7899                              stmt_vector_for_cost *)
7900 {
7901   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7902   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7903   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7904   imm_use_iterator imm_iter;
7905   tree lhs, lhs_type, bitsize, vec_bitsize;
7906   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7907   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7908   int ncopies;
7909   gimple *use_stmt;
7910   auto_vec<tree> vec_oprnds;
7911   int vec_entry = 0;
7912   poly_uint64 vec_index = 0;
7913
7914   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7915
7916   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7917     return false;
7918
7919   /* FORNOW.  CHECKME.  */
7920   if (nested_in_vect_loop_p (loop, stmt))
7921     return false;
7922
7923   /* If STMT is not relevant and it is a simple assignment and its inputs are
7924      invariant then it can remain in place, unvectorized.  The original last
7925      scalar value that it computes will be used.  */
7926   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7927     {
7928       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7929       if (dump_enabled_p ())
7930         dump_printf_loc (MSG_NOTE, vect_location,
7931                          "statement is simple and uses invariant.  Leaving in "
7932                          "place.\n");
7933       return true;
7934     }
7935
7936   if (slp_node)
7937     ncopies = 1;
7938   else
7939     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7940
7941   if (slp_node)
7942     {
7943       gcc_assert (slp_index >= 0);
7944
7945       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7946       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7947
7948       /* Get the last occurrence of the scalar index from the concatenation of
7949          all the slp vectors. Calculate which slp vector it is and the index
7950          within.  */
7951       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7952
7953       /* Calculate which vector contains the result, and which lane of
7954          that vector we need.  */
7955       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7956         {
7957           if (dump_enabled_p ())
7958             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7959                              "Cannot determine which vector holds the"
7960                              " final result.\n");
7961           return false;
7962         }
7963     }
7964
7965   if (!vec_stmt)
7966     {
7967       /* No transformation required.  */
7968       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7969         {
7970           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7971                                                OPTIMIZE_FOR_SPEED))
7972             {
7973               if (dump_enabled_p ())
7974                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7975                                  "can't use a fully-masked loop because "
7976                                  "the target doesn't support extract last "
7977                                  "reduction.\n");
7978               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7979             }
7980           else if (slp_node)
7981             {
7982               if (dump_enabled_p ())
7983                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7984                                  "can't use a fully-masked loop because an "
7985                                  "SLP statement is live after the loop.\n");
7986               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7987             }
7988           else if (ncopies > 1)
7989             {
7990               if (dump_enabled_p ())
7991                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7992                                  "can't use a fully-masked loop because"
7993                                  " ncopies is greater than 1.\n");
7994               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7995             }
7996           else
7997             {
7998               gcc_assert (ncopies == 1 && !slp_node);
7999               vect_record_loop_mask (loop_vinfo,
8000                                      &LOOP_VINFO_MASKS (loop_vinfo),
8001                                      1, vectype);
8002             }
8003         }
8004       return true;
8005     }
8006
8007   /* If stmt has a related stmt, then use that for getting the lhs.  */
8008   if (is_pattern_stmt_p (stmt_info))
8009     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8010
8011   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8012         : gimple_get_lhs (stmt);
8013   lhs_type = TREE_TYPE (lhs);
8014
8015   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8016              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8017              : TYPE_SIZE (TREE_TYPE (vectype)));
8018   vec_bitsize = TYPE_SIZE (vectype);
8019
8020   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8021   tree vec_lhs, bitstart;
8022   if (slp_node)
8023     {
8024       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8025
8026       /* Get the correct slp vectorized stmt.  */
8027       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8028       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8029         vec_lhs = gimple_phi_result (phi);
8030       else
8031         vec_lhs = gimple_get_lhs (vec_stmt);
8032
8033       /* Get entry to use.  */
8034       bitstart = bitsize_int (vec_index);
8035       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8036     }
8037   else
8038     {
8039       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8040       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8041       gcc_checking_assert (ncopies == 1
8042                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8043
8044       /* For multiple copies, get the last copy.  */
8045       for (int i = 1; i < ncopies; ++i)
8046         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8047                                                   vec_lhs);
8048
8049       /* Get the last lane in the vector.  */
8050       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8051     }
8052
8053   gimple_seq stmts = NULL;
8054   tree new_tree;
8055   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8056     {
8057       /* Emit:
8058
8059            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8060
8061          where VEC_LHS is the vectorized live-out result and MASK is
8062          the loop mask for the final iteration.  */
8063       gcc_assert (ncopies == 1 && !slp_node);
8064       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8065       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8066                                       1, vectype, 0);
8067       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8068                                       scalar_type, mask, vec_lhs);
8069
8070       /* Convert the extracted vector element to the required scalar type.  */
8071       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8072     }
8073   else
8074     {
8075       tree bftype = TREE_TYPE (vectype);
8076       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8077         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8078       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8079       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8080                                        &stmts, true, NULL_TREE);
8081     }
8082
8083   if (stmts)
8084     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8085
8086   /* Replace use of lhs with newly computed result.  If the use stmt is a
8087      single arg PHI, just replace all uses of PHI result.  It's necessary
8088      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8089   use_operand_p use_p;
8090   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8091     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8092         && !is_gimple_debug (use_stmt))
8093     {
8094       if (gimple_code (use_stmt) == GIMPLE_PHI
8095           && gimple_phi_num_args (use_stmt) == 1)
8096         {
8097           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8098         }
8099       else
8100         {
8101           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8102             SET_USE (use_p, new_tree);
8103         }
8104       update_stmt (use_stmt);
8105     }
8106
8107   return true;
8108 }
8109
8110 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8111
8112 static void
8113 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8114 {
8115   ssa_op_iter op_iter;
8116   imm_use_iterator imm_iter;
8117   def_operand_p def_p;
8118   gimple *ustmt;
8119
8120   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8121     {
8122       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8123         {
8124           basic_block bb;
8125
8126           if (!is_gimple_debug (ustmt))
8127             continue;
8128
8129           bb = gimple_bb (ustmt);
8130
8131           if (!flow_bb_inside_loop_p (loop, bb))
8132             {
8133               if (gimple_debug_bind_p (ustmt))
8134                 {
8135                   if (dump_enabled_p ())
8136                     dump_printf_loc (MSG_NOTE, vect_location,
8137                                      "killing debug use\n");
8138
8139                   gimple_debug_bind_reset_value (ustmt);
8140                   update_stmt (ustmt);
8141                 }
8142               else
8143                 gcc_unreachable ();
8144             }
8145         }
8146     }
8147 }
8148
8149 /* Given loop represented by LOOP_VINFO, return true if computation of
8150    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8151    otherwise.  */
8152
8153 static bool
8154 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8155 {
8156   /* Constant case.  */
8157   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8158     {
8159       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8160       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8161
8162       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8163       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8164       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8165         return true;
8166     }
8167
8168   widest_int max;
8169   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8170   /* Check the upper bound of loop niters.  */
8171   if (get_max_loop_iterations (loop, &max))
8172     {
8173       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8174       signop sgn = TYPE_SIGN (type);
8175       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8176       if (max < type_max)
8177         return true;
8178     }
8179   return false;
8180 }
8181
8182 /* Return a mask type with half the number of elements as TYPE.  */
8183
8184 tree
8185 vect_halve_mask_nunits (tree type)
8186 {
8187   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8188   return build_truth_vector_type (nunits, current_vector_size);
8189 }
8190
8191 /* Return a mask type with twice as many elements as TYPE.  */
8192
8193 tree
8194 vect_double_mask_nunits (tree type)
8195 {
8196   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8197   return build_truth_vector_type (nunits, current_vector_size);
8198 }
8199
8200 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8201    contain a sequence of NVECTORS masks that each control a vector of type
8202    VECTYPE.  */
8203
8204 void
8205 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8206                        unsigned int nvectors, tree vectype)
8207 {
8208   gcc_assert (nvectors != 0);
8209   if (masks->length () < nvectors)
8210     masks->safe_grow_cleared (nvectors);
8211   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8212   /* The number of scalars per iteration and the number of vectors are
8213      both compile-time constants.  */
8214   unsigned int nscalars_per_iter
8215     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8216                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8217   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8218     {
8219       rgm->max_nscalars_per_iter = nscalars_per_iter;
8220       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8221     }
8222 }
8223
8224 /* Given a complete set of masks MASKS, extract mask number INDEX
8225    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8226    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8227
8228    See the comment above vec_loop_masks for more details about the mask
8229    arrangement.  */
8230
8231 tree
8232 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8233                     unsigned int nvectors, tree vectype, unsigned int index)
8234 {
8235   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8236   tree mask_type = rgm->mask_type;
8237
8238   /* Populate the rgroup's mask array, if this is the first time we've
8239      used it.  */
8240   if (rgm->masks.is_empty ())
8241     {
8242       rgm->masks.safe_grow_cleared (nvectors);
8243       for (unsigned int i = 0; i < nvectors; ++i)
8244         {
8245           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8246           /* Provide a dummy definition until the real one is available.  */
8247           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8248           rgm->masks[i] = mask;
8249         }
8250     }
8251
8252   tree mask = rgm->masks[index];
8253   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8254                 TYPE_VECTOR_SUBPARTS (vectype)))
8255     {
8256       /* A loop mask for data type X can be reused for data type Y
8257          if X has N times more elements than Y and if Y's elements
8258          are N times bigger than X's.  In this case each sequence
8259          of N elements in the loop mask will be all-zero or all-one.
8260          We can then view-convert the mask so that each sequence of
8261          N elements is replaced by a single element.  */
8262       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8263                               TYPE_VECTOR_SUBPARTS (vectype)));
8264       gimple_seq seq = NULL;
8265       mask_type = build_same_sized_truth_vector_type (vectype);
8266       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8267       if (seq)
8268         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8269     }
8270   return mask;
8271 }
8272
8273 /* Scale profiling counters by estimation for LOOP which is vectorized
8274    by factor VF.  */
8275
8276 static void
8277 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8278 {
8279   edge preheader = loop_preheader_edge (loop);
8280   /* Reduce loop iterations by the vectorization factor.  */
8281   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8282   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8283
8284   if (freq_h.nonzero_p ())
8285     {
8286       profile_probability p;
8287
8288       /* Avoid dropping loop body profile counter to 0 because of zero count
8289          in loop's preheader.  */
8290       if (!(freq_e == profile_count::zero ()))
8291         freq_e = freq_e.force_nonzero ();
8292       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8293       scale_loop_frequencies (loop, p);
8294     }
8295
8296   edge exit_e = single_exit (loop);
8297   exit_e->probability = profile_probability::always ()
8298                                  .apply_scale (1, new_est_niter + 1);
8299
8300   edge exit_l = single_pred_edge (loop->latch);
8301   profile_probability prob = exit_l->probability;
8302   exit_l->probability = exit_e->probability.invert ();
8303   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8304     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8305 }
8306
8307 /* Function vect_transform_loop.
8308
8309    The analysis phase has determined that the loop is vectorizable.
8310    Vectorize the loop - created vectorized stmts to replace the scalar
8311    stmts in the loop, and update the loop exit condition.
8312    Returns scalar epilogue loop if any.  */
8313
8314 struct loop *
8315 vect_transform_loop (loop_vec_info loop_vinfo)
8316 {
8317   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8318   struct loop *epilogue = NULL;
8319   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8320   int nbbs = loop->num_nodes;
8321   int i;
8322   tree niters_vector = NULL_TREE;
8323   tree step_vector = NULL_TREE;
8324   tree niters_vector_mult_vf = NULL_TREE;
8325   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8326   unsigned int lowest_vf = constant_lower_bound (vf);
8327   bool grouped_store;
8328   bool slp_scheduled = false;
8329   gimple *stmt, *pattern_stmt;
8330   gimple_seq pattern_def_seq = NULL;
8331   gimple_stmt_iterator pattern_def_si = gsi_none ();
8332   bool transform_pattern_stmt = false;
8333   bool check_profitability = false;
8334   unsigned int th;
8335
8336   if (dump_enabled_p ())
8337     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8338
8339   /* Use the more conservative vectorization threshold.  If the number
8340      of iterations is constant assume the cost check has been performed
8341      by our caller.  If the threshold makes all loops profitable that
8342      run at least the (estimated) vectorization factor number of times
8343      checking is pointless, too.  */
8344   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8345   if (th >= vect_vf_for_cost (loop_vinfo)
8346       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8347     {
8348       if (dump_enabled_p ())
8349         dump_printf_loc (MSG_NOTE, vect_location,
8350                          "Profitability threshold is %d loop iterations.\n",
8351                          th);
8352       check_profitability = true;
8353     }
8354
8355   /* Make sure there exists a single-predecessor exit bb.  Do this before
8356      versioning.   */
8357   edge e = single_exit (loop);
8358   if (! single_pred_p (e->dest))
8359     {
8360       split_loop_exit_edge (e);
8361       if (dump_enabled_p ())
8362         dump_printf (MSG_NOTE, "split exit edge\n");
8363     }
8364
8365   /* Version the loop first, if required, so the profitability check
8366      comes first.  */
8367
8368   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8369     {
8370       poly_uint64 versioning_threshold
8371         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8372       if (check_profitability
8373           && ordered_p (poly_uint64 (th), versioning_threshold))
8374         {
8375           versioning_threshold = ordered_max (poly_uint64 (th),
8376                                               versioning_threshold);
8377           check_profitability = false;
8378         }
8379       vect_loop_versioning (loop_vinfo, th, check_profitability,
8380                             versioning_threshold);
8381       check_profitability = false;
8382     }
8383
8384   /* Make sure there exists a single-predecessor exit bb also on the
8385      scalar loop copy.  Do this after versioning but before peeling
8386      so CFG structure is fine for both scalar and if-converted loop
8387      to make slpeel_duplicate_current_defs_from_edges face matched
8388      loop closed PHI nodes on the exit.  */
8389   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8390     {
8391       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8392       if (! single_pred_p (e->dest))
8393         {
8394           split_loop_exit_edge (e);
8395           if (dump_enabled_p ())
8396             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8397         }
8398     }
8399
8400   tree niters = vect_build_loop_niters (loop_vinfo);
8401   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8402   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8403   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8404   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8405                               &step_vector, &niters_vector_mult_vf, th,
8406                               check_profitability, niters_no_overflow);
8407
8408   if (niters_vector == NULL_TREE)
8409     {
8410       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8411           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8412           && known_eq (lowest_vf, vf))
8413         {
8414           niters_vector
8415             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8416                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8417           step_vector = build_one_cst (TREE_TYPE (niters));
8418         }
8419       else
8420         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8421                                      &step_vector, niters_no_overflow);
8422     }
8423
8424   /* 1) Make sure the loop header has exactly two entries
8425      2) Make sure we have a preheader basic block.  */
8426
8427   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8428
8429   split_edge (loop_preheader_edge (loop));
8430
8431   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8432       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8433     /* This will deal with any possible peeling.  */
8434     vect_prepare_for_masked_peels (loop_vinfo);
8435
8436   /* FORNOW: the vectorizer supports only loops which body consist
8437      of one basic block (header + empty latch). When the vectorizer will
8438      support more involved loop forms, the order by which the BBs are
8439      traversed need to be reconsidered.  */
8440
8441   for (i = 0; i < nbbs; i++)
8442     {
8443       basic_block bb = bbs[i];
8444       stmt_vec_info stmt_info;
8445
8446       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8447            gsi_next (&si))
8448         {
8449           gphi *phi = si.phi ();
8450           if (dump_enabled_p ())
8451             {
8452               dump_printf_loc (MSG_NOTE, vect_location,
8453                                "------>vectorizing phi: ");
8454               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8455             }
8456           stmt_info = vinfo_for_stmt (phi);
8457           if (!stmt_info)
8458             continue;
8459
8460           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8461             vect_loop_kill_debug_uses (loop, phi);
8462
8463           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8464               && !STMT_VINFO_LIVE_P (stmt_info))
8465             continue;
8466
8467           if (STMT_VINFO_VECTYPE (stmt_info)
8468               && (maybe_ne
8469                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8470               && dump_enabled_p ())
8471             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8472
8473           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8474                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8475                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8476               && ! PURE_SLP_STMT (stmt_info))
8477             {
8478               if (dump_enabled_p ())
8479                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8480               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8481             }
8482         }
8483
8484       pattern_stmt = NULL;
8485       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8486            !gsi_end_p (si) || transform_pattern_stmt;)
8487         {
8488           bool is_store;
8489
8490           if (transform_pattern_stmt)
8491             stmt = pattern_stmt;
8492           else
8493             {
8494               stmt = gsi_stmt (si);
8495               /* During vectorization remove existing clobber stmts.  */
8496               if (gimple_clobber_p (stmt))
8497                 {
8498                   unlink_stmt_vdef (stmt);
8499                   gsi_remove (&si, true);
8500                   release_defs (stmt);
8501                   continue;
8502                 }
8503             }
8504
8505           if (dump_enabled_p ())
8506             {
8507               dump_printf_loc (MSG_NOTE, vect_location,
8508                                "------>vectorizing statement: ");
8509               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8510             }
8511
8512           stmt_info = vinfo_for_stmt (stmt);
8513
8514           /* vector stmts created in the outer-loop during vectorization of
8515              stmts in an inner-loop may not have a stmt_info, and do not
8516              need to be vectorized.  */
8517           if (!stmt_info)
8518             {
8519               gsi_next (&si);
8520               continue;
8521             }
8522
8523           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8524             vect_loop_kill_debug_uses (loop, stmt);
8525
8526           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8527               && !STMT_VINFO_LIVE_P (stmt_info))
8528             {
8529               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8530                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8531                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8532                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8533                 {
8534                   stmt = pattern_stmt;
8535                   stmt_info = vinfo_for_stmt (stmt);
8536                 }
8537               else
8538                 {
8539                   gsi_next (&si);
8540                   continue;
8541                 }
8542             }
8543           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8544                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8545                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8546                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8547             transform_pattern_stmt = true;
8548
8549           /* If pattern statement has def stmts, vectorize them too.  */
8550           if (is_pattern_stmt_p (stmt_info))
8551             {
8552               if (pattern_def_seq == NULL)
8553                 {
8554                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8555                   pattern_def_si = gsi_start (pattern_def_seq);
8556                 }
8557               else if (!gsi_end_p (pattern_def_si))
8558                 gsi_next (&pattern_def_si);
8559               if (pattern_def_seq != NULL)
8560                 {
8561                   gimple *pattern_def_stmt = NULL;
8562                   stmt_vec_info pattern_def_stmt_info = NULL;
8563
8564                   while (!gsi_end_p (pattern_def_si))
8565                     {
8566                       pattern_def_stmt = gsi_stmt (pattern_def_si);
8567                       pattern_def_stmt_info
8568                         = vinfo_for_stmt (pattern_def_stmt);
8569                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8570                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8571                         break;
8572                       gsi_next (&pattern_def_si);
8573                     }
8574
8575                   if (!gsi_end_p (pattern_def_si))
8576                     {
8577                       if (dump_enabled_p ())
8578                         {
8579                           dump_printf_loc (MSG_NOTE, vect_location,
8580                                            "==> vectorizing pattern def "
8581                                            "stmt: ");
8582                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8583                                             pattern_def_stmt, 0);
8584                         }
8585
8586                       stmt = pattern_def_stmt;
8587                       stmt_info = pattern_def_stmt_info;
8588                     }
8589                   else
8590                     {
8591                       pattern_def_si = gsi_none ();
8592                       transform_pattern_stmt = false;
8593                     }
8594                 }
8595               else
8596                 transform_pattern_stmt = false;
8597             }
8598
8599           if (STMT_VINFO_VECTYPE (stmt_info))
8600             {
8601               poly_uint64 nunits
8602                 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8603               if (!STMT_SLP_TYPE (stmt_info)
8604                   && maybe_ne (nunits, vf)
8605                   && dump_enabled_p ())
8606                   /* For SLP VF is set according to unrolling factor, and not
8607                      to vector size, hence for SLP this print is not valid.  */
8608                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8609             }
8610
8611           /* SLP. Schedule all the SLP instances when the first SLP stmt is
8612              reached.  */
8613           if (STMT_SLP_TYPE (stmt_info))
8614             {
8615               if (!slp_scheduled)
8616                 {
8617                   slp_scheduled = true;
8618
8619                   if (dump_enabled_p ())
8620                     dump_printf_loc (MSG_NOTE, vect_location,
8621                                      "=== scheduling SLP instances ===\n");
8622
8623                   vect_schedule_slp (loop_vinfo);
8624                 }
8625
8626               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8627               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8628                 {
8629                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8630                     {
8631                       pattern_def_seq = NULL;
8632                       gsi_next (&si);
8633                     }
8634                   continue;
8635                 }
8636             }
8637
8638           /* -------- vectorize statement ------------ */
8639           if (dump_enabled_p ())
8640             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8641
8642           grouped_store = false;
8643           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8644           if (is_store)
8645             {
8646               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8647                 {
8648                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8649                      interleaving chain was completed - free all the stores in
8650                      the chain.  */
8651                   gsi_next (&si);
8652                   vect_remove_stores (DR_GROUP_FIRST_ELEMENT (stmt_info));
8653                 }
8654               else
8655                 {
8656                   /* Free the attached stmt_vec_info and remove the stmt.  */
8657                   gimple *store = gsi_stmt (si);
8658                   free_stmt_vec_info (store);
8659                   unlink_stmt_vdef (store);
8660                   gsi_remove (&si, true);
8661                   release_defs (store);
8662                 }
8663
8664               /* Stores can only appear at the end of pattern statements.  */
8665               gcc_assert (!transform_pattern_stmt);
8666               pattern_def_seq = NULL;
8667             }
8668           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8669             {
8670               pattern_def_seq = NULL;
8671               gsi_next (&si);
8672             }
8673         }                       /* stmts in BB */
8674
8675       /* Stub out scalar statements that must not survive vectorization.
8676          Doing this here helps with grouped statements, or statements that
8677          are involved in patterns.  */
8678       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8679            !gsi_end_p (gsi); gsi_next (&gsi))
8680         {
8681           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8682           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8683             {
8684               tree lhs = gimple_get_lhs (call);
8685               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8686                 {
8687                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8688                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8689                   gsi_replace (&gsi, new_stmt, true);
8690                 }
8691             }
8692         }
8693     }                           /* BBs in loop */
8694
8695   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8696      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8697   if (integer_onep (step_vector))
8698     niters_no_overflow = true;
8699   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8700                            niters_vector_mult_vf, !niters_no_overflow);
8701
8702   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8703   scale_profile_for_vect_loop (loop, assumed_vf);
8704
8705   /* True if the final iteration might not handle a full vector's
8706      worth of scalar iterations.  */
8707   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8708   /* The minimum number of iterations performed by the epilogue.  This
8709      is 1 when peeling for gaps because we always need a final scalar
8710      iteration.  */
8711   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8712   /* +1 to convert latch counts to loop iteration counts,
8713      -min_epilogue_iters to remove iterations that cannot be performed
8714        by the vector code.  */
8715   int bias_for_lowest = 1 - min_epilogue_iters;
8716   int bias_for_assumed = bias_for_lowest;
8717   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8718   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8719     {
8720       /* When the amount of peeling is known at compile time, the first
8721          iteration will have exactly alignment_npeels active elements.
8722          In the worst case it will have at least one.  */
8723       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8724       bias_for_lowest += lowest_vf - min_first_active;
8725       bias_for_assumed += assumed_vf - min_first_active;
8726     }
8727   /* In these calculations the "- 1" converts loop iteration counts
8728      back to latch counts.  */
8729   if (loop->any_upper_bound)
8730     loop->nb_iterations_upper_bound
8731       = (final_iter_may_be_partial
8732          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8733                           lowest_vf) - 1
8734          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8735                            lowest_vf) - 1);
8736   if (loop->any_likely_upper_bound)
8737     loop->nb_iterations_likely_upper_bound
8738       = (final_iter_may_be_partial
8739          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8740                           + bias_for_lowest, lowest_vf) - 1
8741          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8742                            + bias_for_lowest, lowest_vf) - 1);
8743   if (loop->any_estimate)
8744     loop->nb_iterations_estimate
8745       = (final_iter_may_be_partial
8746          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8747                           assumed_vf) - 1
8748          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8749                            assumed_vf) - 1);
8750
8751   if (dump_enabled_p ())
8752     {
8753       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8754         {
8755           dump_printf_loc (MSG_NOTE, vect_location,
8756                            "LOOP VECTORIZED\n");
8757           if (loop->inner)
8758             dump_printf_loc (MSG_NOTE, vect_location,
8759                              "OUTER LOOP VECTORIZED\n");
8760           dump_printf (MSG_NOTE, "\n");
8761         }
8762       else
8763         {
8764           dump_printf_loc (MSG_NOTE, vect_location,
8765                            "LOOP EPILOGUE VECTORIZED (VS=");
8766           dump_dec (MSG_NOTE, current_vector_size);
8767           dump_printf (MSG_NOTE, ")\n");
8768         }
8769     }
8770
8771   /* Free SLP instances here because otherwise stmt reference counting
8772      won't work.  */
8773   slp_instance instance;
8774   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8775     vect_free_slp_instance (instance);
8776   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8777   /* Clear-up safelen field since its value is invalid after vectorization
8778      since vectorized loop can have loop-carried dependencies.  */
8779   loop->safelen = 0;
8780
8781   /* Don't vectorize epilogue for epilogue.  */
8782   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8783     epilogue = NULL;
8784
8785   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8786     epilogue = NULL;
8787
8788   if (epilogue)
8789     {
8790       auto_vector_sizes vector_sizes;
8791       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8792       unsigned int next_size = 0;
8793
8794       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8795           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8796           && known_eq (vf, lowest_vf))
8797         {
8798           unsigned int eiters
8799             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8800                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8801           eiters = eiters % lowest_vf;
8802           epilogue->nb_iterations_upper_bound = eiters - 1;
8803
8804           unsigned int ratio;
8805           while (next_size < vector_sizes.length ()
8806                  && !(constant_multiple_p (current_vector_size,
8807                                            vector_sizes[next_size], &ratio)
8808                       && eiters >= lowest_vf / ratio))
8809             next_size += 1;
8810         }
8811       else
8812         while (next_size < vector_sizes.length ()
8813                && maybe_lt (current_vector_size, vector_sizes[next_size]))
8814           next_size += 1;
8815
8816       if (next_size == vector_sizes.length ())
8817         epilogue = NULL;
8818     }
8819
8820   if (epilogue)
8821     {
8822       epilogue->force_vectorize = loop->force_vectorize;
8823       epilogue->safelen = loop->safelen;
8824       epilogue->dont_vectorize = false;
8825
8826       /* We may need to if-convert epilogue to vectorize it.  */
8827       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8828         tree_if_conversion (epilogue);
8829     }
8830
8831   return epilogue;
8832 }
8833
8834 /* The code below is trying to perform simple optimization - revert
8835    if-conversion for masked stores, i.e. if the mask of a store is zero
8836    do not perform it and all stored value producers also if possible.
8837    For example,
8838      for (i=0; i<n; i++)
8839        if (c[i])
8840         {
8841           p1[i] += 1;
8842           p2[i] = p3[i] +2;
8843         }
8844    this transformation will produce the following semi-hammock:
8845
8846    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8847      {
8848        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8849        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8850        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8851        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8852        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8853        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8854      }
8855 */
8856
8857 void
8858 optimize_mask_stores (struct loop *loop)
8859 {
8860   basic_block *bbs = get_loop_body (loop);
8861   unsigned nbbs = loop->num_nodes;
8862   unsigned i;
8863   basic_block bb;
8864   struct loop *bb_loop;
8865   gimple_stmt_iterator gsi;
8866   gimple *stmt;
8867   auto_vec<gimple *> worklist;
8868
8869   vect_location = find_loop_location (loop);
8870   /* Pick up all masked stores in loop if any.  */
8871   for (i = 0; i < nbbs; i++)
8872     {
8873       bb = bbs[i];
8874       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8875            gsi_next (&gsi))
8876         {
8877           stmt = gsi_stmt (gsi);
8878           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8879             worklist.safe_push (stmt);
8880         }
8881     }
8882
8883   free (bbs);
8884   if (worklist.is_empty ())
8885     return;
8886
8887   /* Loop has masked stores.  */
8888   while (!worklist.is_empty ())
8889     {
8890       gimple *last, *last_store;
8891       edge e, efalse;
8892       tree mask;
8893       basic_block store_bb, join_bb;
8894       gimple_stmt_iterator gsi_to;
8895       tree vdef, new_vdef;
8896       gphi *phi;
8897       tree vectype;
8898       tree zero;
8899
8900       last = worklist.pop ();
8901       mask = gimple_call_arg (last, 2);
8902       bb = gimple_bb (last);
8903       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8904          the same loop as if_bb.  It could be different to LOOP when two
8905          level loop-nest is vectorized and mask_store belongs to the inner
8906          one.  */
8907       e = split_block (bb, last);
8908       bb_loop = bb->loop_father;
8909       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8910       join_bb = e->dest;
8911       store_bb = create_empty_bb (bb);
8912       add_bb_to_loop (store_bb, bb_loop);
8913       e->flags = EDGE_TRUE_VALUE;
8914       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8915       /* Put STORE_BB to likely part.  */
8916       efalse->probability = profile_probability::unlikely ();
8917       store_bb->count = efalse->count ();
8918       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8919       if (dom_info_available_p (CDI_DOMINATORS))
8920         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8921       if (dump_enabled_p ())
8922         dump_printf_loc (MSG_NOTE, vect_location,
8923                          "Create new block %d to sink mask stores.",
8924                          store_bb->index);
8925       /* Create vector comparison with boolean result.  */
8926       vectype = TREE_TYPE (mask);
8927       zero = build_zero_cst (vectype);
8928       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8929       gsi = gsi_last_bb (bb);
8930       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8931       /* Create new PHI node for vdef of the last masked store:
8932          .MEM_2 = VDEF <.MEM_1>
8933          will be converted to
8934          .MEM.3 = VDEF <.MEM_1>
8935          and new PHI node will be created in join bb
8936          .MEM_2 = PHI <.MEM_1, .MEM_3>
8937       */
8938       vdef = gimple_vdef (last);
8939       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8940       gimple_set_vdef (last, new_vdef);
8941       phi = create_phi_node (vdef, join_bb);
8942       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8943
8944       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8945       while (true)
8946         {
8947           gimple_stmt_iterator gsi_from;
8948           gimple *stmt1 = NULL;
8949
8950           /* Move masked store to STORE_BB.  */
8951           last_store = last;
8952           gsi = gsi_for_stmt (last);
8953           gsi_from = gsi;
8954           /* Shift GSI to the previous stmt for further traversal.  */
8955           gsi_prev (&gsi);
8956           gsi_to = gsi_start_bb (store_bb);
8957           gsi_move_before (&gsi_from, &gsi_to);
8958           /* Setup GSI_TO to the non-empty block start.  */
8959           gsi_to = gsi_start_bb (store_bb);
8960           if (dump_enabled_p ())
8961             {
8962               dump_printf_loc (MSG_NOTE, vect_location,
8963                                "Move stmt to created bb\n");
8964               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8965             }
8966           /* Move all stored value producers if possible.  */
8967           while (!gsi_end_p (gsi))
8968             {
8969               tree lhs;
8970               imm_use_iterator imm_iter;
8971               use_operand_p use_p;
8972               bool res;
8973
8974               /* Skip debug statements.  */
8975               if (is_gimple_debug (gsi_stmt (gsi)))
8976                 {
8977                   gsi_prev (&gsi);
8978                   continue;
8979                 }
8980               stmt1 = gsi_stmt (gsi);
8981               /* Do not consider statements writing to memory or having
8982                  volatile operand.  */
8983               if (gimple_vdef (stmt1)
8984                   || gimple_has_volatile_ops (stmt1))
8985                 break;
8986               gsi_from = gsi;
8987               gsi_prev (&gsi);
8988               lhs = gimple_get_lhs (stmt1);
8989               if (!lhs)
8990                 break;
8991
8992               /* LHS of vectorized stmt must be SSA_NAME.  */
8993               if (TREE_CODE (lhs) != SSA_NAME)
8994                 break;
8995
8996               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8997                 {
8998                   /* Remove dead scalar statement.  */
8999                   if (has_zero_uses (lhs))
9000                     {
9001                       gsi_remove (&gsi_from, true);
9002                       continue;
9003                     }
9004                 }
9005
9006               /* Check that LHS does not have uses outside of STORE_BB.  */
9007               res = true;
9008               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9009                 {
9010                   gimple *use_stmt;
9011                   use_stmt = USE_STMT (use_p);
9012                   if (is_gimple_debug (use_stmt))
9013                     continue;
9014                   if (gimple_bb (use_stmt) != store_bb)
9015                     {
9016                       res = false;
9017                       break;
9018                     }
9019                 }
9020               if (!res)
9021                 break;
9022
9023               if (gimple_vuse (stmt1)
9024                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
9025                 break;
9026
9027               /* Can move STMT1 to STORE_BB.  */
9028               if (dump_enabled_p ())
9029                 {
9030                   dump_printf_loc (MSG_NOTE, vect_location,
9031                                    "Move stmt to created bb\n");
9032                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9033                 }
9034               gsi_move_before (&gsi_from, &gsi_to);
9035               /* Shift GSI_TO for further insertion.  */
9036               gsi_prev (&gsi_to);
9037             }
9038           /* Put other masked stores with the same mask to STORE_BB.  */
9039           if (worklist.is_empty ()
9040               || gimple_call_arg (worklist.last (), 2) != mask
9041               || worklist.last () != stmt1)
9042             break;
9043           last = worklist.pop ();
9044         }
9045       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9046     }
9047 }