gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70    as if it had been manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134    Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141    Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145    Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html  */
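/* A minimal, self-contained sketch of the transformation described above,
   written here with GNU C vector extensions rather than the mode attribute;
   N is assumed to be a multiple of 8 so that no scalar epilogue is needed:

     typedef short v8hi __attribute__ ((vector_size (16)));

     #define N 1024
     short a[N], b[N], c[N];

     void
     scalar_add (void)
     {
       for (int i = 0; i < N; i++)
         a[i] = b[i] + c[i];
     }

     void
     vectorized_add (void)
     {
       v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;

       for (int i = 0; i < N / 8; i++)
         pa[i] = pb[i] + pc[i];
     }
*/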
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
184 if (stmt_vectype)
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype had been already set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return true;
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
222 return false;
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
227 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
229 /* If a pattern statement has def stmts, analyze them too. */
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
234 stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: ");
239 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
240 def_stmt_info->stmt, 0);
242 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
243 vf, mask_producers))
244 return false;
247 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE, vect_location,
250 "==> examining pattern statement: ");
251 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
254 return false;
257 return true;
260 /* Function vect_determine_vectorization_factor
262 Determine the vectorization factor (VF). VF is the number of data elements
263 that are operated upon in parallel in a single iteration of the vectorized
264    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
265    on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
266 elements can fit in a single vector register.
268 We currently support vectorization of loops in which all types operated upon
269 are of the same size. Therefore this function currently sets VF according to
270 the size of the types operated upon, and fails if there are multiple sizes
271 in the loop.
273 VF is also the factor by which the loop iterations are strip-mined, e.g.:
274 original loop:
275 for (i=0; i<N; i++){
276 a[i] = b[i] + c[i];
279 vectorized loop:
280 for (i=0; i<N; i+=VF){
281        a[i:VF] = b[i:VF] + c[i:VF];
        }  */
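/* A worked instance of the computation above (a sketch, assuming a 16-byte
   vector size): if the elements operated upon are 4-byte ints,
   VF = 16 / 4 = 4 and the strip-mined loop becomes

     for (i = 0; i < N; i += 4)
       a[i:4] = b[i:4] + c[i:4];

   whereas 2-byte shorts on the same target would give VF = 16 / 2 = 8.  */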
285 static bool
286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
289 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
290 unsigned nbbs = loop->num_nodes;
291 poly_uint64 vectorization_factor = 1;
292 tree scalar_type = NULL_TREE;
293 gphi *phi;
294 tree vectype;
295 stmt_vec_info stmt_info;
296 unsigned i;
297 auto_vec<stmt_vec_info> mask_producers;
299 if (dump_enabled_p ())
300 dump_printf_loc (MSG_NOTE, vect_location,
301 "=== vect_determine_vectorization_factor ===\n");
303 for (i = 0; i < nbbs; i++)
305 basic_block bb = bbs[i];
307 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
308 gsi_next (&si))
310 phi = si.phi ();
311 stmt_info = vinfo_for_stmt (phi);
312 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
315 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
318 gcc_assert (stmt_info);
320 if (STMT_VINFO_RELEVANT_P (stmt_info)
321 || STMT_VINFO_LIVE_P (stmt_info))
323 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
324 scalar_type = TREE_TYPE (PHI_RESULT (phi));
326 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location,
329 "get vectype for scalar type: ");
330 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
331 dump_printf (MSG_NOTE, "\n");
334 vectype = get_vectype_for_scalar_type (scalar_type);
335 if (!vectype)
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
340 "not vectorized: unsupported "
341 "data-type ");
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
343 scalar_type);
344 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
346 return false;
348 STMT_VINFO_VECTYPE (stmt_info) = vectype;
350 if (dump_enabled_p ())
352 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
353 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
354 dump_printf (MSG_NOTE, "\n");
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
360 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
361 dump_printf (MSG_NOTE, "\n");
364 vect_update_max_nunits (&vectorization_factor, vectype);
368 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
369 gsi_next (&si))
371 stmt_info = vinfo_for_stmt (gsi_stmt (si));
372 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
373 &mask_producers))
374 return false;
378   /* TODO: Analyze cost.  Decide if worthwhile to vectorize.  */
379 if (dump_enabled_p ())
381 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
382 dump_dec (MSG_NOTE, vectorization_factor);
383 dump_printf (MSG_NOTE, "\n");
386 if (known_le (vectorization_factor, 1U))
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "not vectorized: unsupported data-type\n");
391 return false;
393 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
395 for (i = 0; i < mask_producers.length (); i++)
397 stmt_info = mask_producers[i];
398 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
399 if (!mask_type)
400 return false;
401 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
404 return true;
408 /* Function vect_is_simple_iv_evolution.
410    FORNOW: A simple evolution of an induction variable in the loop is
411 considered a polynomial evolution. */
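/* For example (illustrative only): the scalar evolution analyzer describes
   a simple IV such as "i" in

     for (i = 0; i < n; i++)

   as the chrec {0, +, 1}_1 (initial value 0, step 1 in loop number 1);
   this function accepts it and returns INIT = 0, STEP = 1.  An access
   function whose step is itself a chrec, e.g. {0, +, {1, +, 1}_1}_1, is a
   polynomial of degree >= 2 and is rejected as not "simple".  */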
413 static bool
414 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
415 tree * step)
417 tree init_expr;
418 tree step_expr;
419 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
420 basic_block bb;
422 /* When there is no evolution in this loop, the evolution function
423 is not "simple". */
424 if (evolution_part == NULL_TREE)
425 return false;
427 /* When the evolution is a polynomial of degree >= 2
428 the evolution function is not "simple". */
429 if (tree_is_chrec (evolution_part))
430 return false;
432 step_expr = evolution_part;
433 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
435 if (dump_enabled_p ())
437 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
438 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
439 dump_printf (MSG_NOTE, ", init: ");
440 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
441 dump_printf (MSG_NOTE, "\n");
444 *init = init_expr;
445 *step = step_expr;
447 if (TREE_CODE (step_expr) != INTEGER_CST
448 && (TREE_CODE (step_expr) != SSA_NAME
449 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
450 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
451 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
452 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
453 || !flag_associative_math)))
454 && (TREE_CODE (step_expr) != REAL_CST
455 || !flag_associative_math))
457 if (dump_enabled_p ())
458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
459 "step unknown.\n");
460 return false;
463 return true;
466 /* Function vect_analyze_scalar_cycles_1.
468 Examine the cross iteration def-use cycles of scalar variables
469 in LOOP. LOOP_VINFO represents the loop that is now being
470 considered for vectorization (can be LOOP, or an outer-loop
471 enclosing LOOP). */
473 static void
474 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
476 basic_block bb = loop->header;
477 tree init, step;
478 auto_vec<gimple *, 64> worklist;
479 gphi_iterator gsi;
480 bool double_reduc;
482 if (dump_enabled_p ())
483 dump_printf_loc (MSG_NOTE, vect_location,
484 "=== vect_analyze_scalar_cycles ===\n");
486 /* First - identify all inductions. Reduction detection assumes that all the
487 inductions have been identified, therefore, this order must not be
488 changed. */
489 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
491 gphi *phi = gsi.phi ();
492 tree access_fn = NULL;
493 tree def = PHI_RESULT (phi);
494 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
496 if (dump_enabled_p ())
498 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
499 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
502 /* Skip virtual phi's. The data dependences that are associated with
503 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
504 if (virtual_operand_p (def))
505 continue;
507 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
509 /* Analyze the evolution function. */
510 access_fn = analyze_scalar_evolution (loop, def);
511 if (access_fn)
513 STRIP_NOPS (access_fn);
514 if (dump_enabled_p ())
516 dump_printf_loc (MSG_NOTE, vect_location,
517 "Access function of PHI: ");
518 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
519 dump_printf (MSG_NOTE, "\n");
521 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 = initial_condition_in_loop_num (access_fn, loop->num);
523 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
524 = evolution_part_in_loop_num (access_fn, loop->num);
527 if (!access_fn
528 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
529 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
530 && TREE_CODE (step) != INTEGER_CST))
532 worklist.safe_push (phi);
533 continue;
536 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
537 != NULL_TREE);
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
540 if (dump_enabled_p ())
541 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
542 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
546 /* Second - identify all reductions and nested cycles. */
547 while (worklist.length () > 0)
549 gimple *phi = worklist.pop ();
550 tree def = PHI_RESULT (phi);
551 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
552 gimple *reduc_stmt;
554 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
557 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
560 gcc_assert (!virtual_operand_p (def)
561 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
563 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
564 &double_reduc, false);
565 if (reduc_stmt)
567 if (double_reduc)
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "Detected double reduction.\n");
573 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
574 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
575 vect_double_reduction_def;
577 else
579 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
581 if (dump_enabled_p ())
582 dump_printf_loc (MSG_NOTE, vect_location,
583 "Detected vectorizable nested cycle.\n");
585 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
586 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
587 vect_nested_cycle;
589 else
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "Detected reduction.\n");
595 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
596 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
597 vect_reduction_def;
598 /* Store the reduction cycles for possible vectorization in
599 loop-aware SLP if it was not detected as reduction
600 chain. */
601 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
602 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
606 else
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
609 "Unknown def-use cycle pattern.\n");
614 /* Function vect_analyze_scalar_cycles.
616 Examine the cross iteration def-use cycles of scalar variables, by
617 analyzing the loop-header PHIs of scalar variables. Classify each
618 cycle as one of the following: invariant, induction, reduction, unknown.
619    We do that for the loop represented by LOOP_VINFO, and also for its
620    inner-loop, if it exists.
621 Examples for scalar cycles:
623 Example1: reduction:
625 loop1:
626 for (i=0; i<N; i++)
627 sum += a[i];
629 Example2: induction:
631 loop2:
632 for (i=0; i<N; i++)
633 a[i] = i; */
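/* A further example (a sketch): when LOOP_VINFO represents the outer loop
   below, the inner-loop PHI for "s" is classified as a nested cycle rather
   than as a reduction of the loop being vectorized:

     loop3:
       for (i = 0; i < N; i++)
         {
           s = 0;
           for (j = 0; j < M; j++)
             s += b[i][j];
           a[i] = s;
         }
*/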
635 static void
636 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
638 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
640 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
642 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
643     Reductions in such an inner-loop therefore have different properties than
644 the reductions in the nest that gets vectorized:
645 1. When vectorized, they are executed in the same order as in the original
646 scalar loop, so we can't change the order of computation when
647 vectorizing them.
648 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
649 current checks are too strict. */
651 if (loop->inner)
652 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
655 /* Transfer group and reduction information from STMT to its pattern stmt. */
657 static void
658 vect_fixup_reduc_chain (gimple *stmt)
660 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
661 gimple *stmtp;
662 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
663 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
664 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
667 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
668 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
669 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
670 if (stmt)
671 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
672 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
674 while (stmt);
675 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
678 /* Fixup scalar cycles that now have their stmts detected as patterns. */
680 static void
681 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
683 gimple *first;
684 unsigned i;
686 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
687 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
689 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
690 while (next)
692 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
693 break;
694 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
696        /* If not all stmts in the chain are patterns, try to handle
697 the chain without patterns. */
698 if (! next)
700 vect_fixup_reduc_chain (first);
701 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
702 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
707 /* Function vect_get_loop_niters.
709 Determine how many iterations the loop is executed and place it
710 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
711 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
712 niter information holds in ASSUMPTIONS.
714 Return the loop exit condition. */
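/* For example (illustrative only): for

     for (i = 0; i < n; i++)
       a[i] = 0;

   with n known to be positive, the latch executes n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS (the number of
   header executions) is n.  If the niter analysis only holds under some
   condition, e.g. that n does not overflow the IV type, that condition is
   returned in ASSUMPTIONS.  */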
717 static gcond *
718 vect_get_loop_niters (struct loop *loop, tree *assumptions,
719 tree *number_of_iterations, tree *number_of_iterationsm1)
721 edge exit = single_exit (loop);
722 struct tree_niter_desc niter_desc;
723 tree niter_assumptions, niter, may_be_zero;
724 gcond *cond = get_loop_exit_condition (loop);
726 *assumptions = boolean_true_node;
727 *number_of_iterationsm1 = chrec_dont_know;
728 *number_of_iterations = chrec_dont_know;
729 if (dump_enabled_p ())
730 dump_printf_loc (MSG_NOTE, vect_location,
731 "=== get_loop_niters ===\n");
733 if (!exit)
734 return cond;
736 niter = chrec_dont_know;
737 may_be_zero = NULL_TREE;
738 niter_assumptions = boolean_true_node;
739 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
740 || chrec_contains_undetermined (niter_desc.niter))
741 return cond;
743 niter_assumptions = niter_desc.assumptions;
744 may_be_zero = niter_desc.may_be_zero;
745 niter = niter_desc.niter;
747 if (may_be_zero && integer_zerop (may_be_zero))
748 may_be_zero = NULL_TREE;
750 if (may_be_zero)
752 if (COMPARISON_CLASS_P (may_be_zero))
754 /* Try to combine may_be_zero with assumptions, this can simplify
755 computation of niter expression. */
756 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
757 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
758 niter_assumptions,
759 fold_build1 (TRUTH_NOT_EXPR,
760 boolean_type_node,
761 may_be_zero));
762 else
763 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
764 build_int_cst (TREE_TYPE (niter), 0),
765 rewrite_to_non_trapping_overflow (niter));
767 may_be_zero = NULL_TREE;
769 else if (integer_nonzerop (may_be_zero))
771 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
772 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
773 return cond;
775 else
776 return cond;
779 *assumptions = niter_assumptions;
780 *number_of_iterationsm1 = niter;
782 /* We want the number of loop header executions which is the number
783 of latch executions plus one.
784 ??? For UINT_MAX latch executions this number overflows to zero
785 for loops like do { n++; } while (n != 0); */
786 if (niter && !chrec_contains_undetermined (niter))
787 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
788 build_int_cst (TREE_TYPE (niter), 1));
789 *number_of_iterations = niter;
791 return cond;
794 /* Function bb_in_loop_p
796 Used as predicate for dfs order traversal of the loop bbs. */
798 static bool
799 bb_in_loop_p (const_basic_block bb, const void *data)
801 const struct loop *const loop = (const struct loop *)data;
802 if (flow_bb_inside_loop_p (loop, bb))
803 return true;
804 return false;
808 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
809 stmt_vec_info structs for all the stmts in LOOP_IN. */
811 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
812 : vec_info (vec_info::loop, init_cost (loop_in)),
813 loop (loop_in),
814 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
815 num_itersm1 (NULL_TREE),
816 num_iters (NULL_TREE),
817 num_iters_unchanged (NULL_TREE),
818 num_iters_assumptions (NULL_TREE),
819 th (0),
820 versioning_threshold (0),
821 vectorization_factor (0),
822 max_vectorization_factor (0),
823 mask_skip_niters (NULL_TREE),
824 mask_compare_type (NULL_TREE),
825 unaligned_dr (NULL),
826 peeling_for_alignment (0),
827 ptr_mask (0),
828 ivexpr_map (NULL),
829 slp_unrolling_factor (1),
830 single_scalar_iteration_cost (0),
831 vectorizable (false),
832 can_fully_mask_p (true),
833 fully_masked_p (false),
834 peeling_for_gaps (false),
835 peeling_for_niter (false),
836 operands_swapped (false),
837 no_data_dependencies (false),
838 has_mask_store (false),
839 scalar_loop (NULL),
840 orig_loop_info (NULL)
842 /* Create/Update stmt_info for all stmts in the loop. */
843 basic_block *body = get_loop_body (loop);
844 for (unsigned int i = 0; i < loop->num_nodes; i++)
846 basic_block bb = body[i];
847 gimple_stmt_iterator si;
849 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
851 gimple *phi = gsi_stmt (si);
852 gimple_set_uid (phi, 0);
853 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
856 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
858 gimple *stmt = gsi_stmt (si);
859 gimple_set_uid (stmt, 0);
860 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
863 free (body);
865 /* CHECKME: We want to visit all BBs before their successors (except for
866 latch blocks, for which this assertion wouldn't hold). In the simple
867      case of the loop forms we allow, a dfs order of the BBs would be the same
868 as reversed postorder traversal, so we are safe. */
870 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
871 bbs, loop->num_nodes, loop);
872 gcc_assert (nbbs == loop->num_nodes);
875 /* Free all levels of MASKS. */
877 void
878 release_vec_loop_masks (vec_loop_masks *masks)
880 rgroup_masks *rgm;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (*masks, i, rgm)
883 rgm->masks.release ();
884 masks->release ();
887 /* Free all memory used by the _loop_vec_info, as well as all the
888 stmt_vec_info structs of all the stmts in the loop. */
890 _loop_vec_info::~_loop_vec_info ()
892 int nbbs;
893 gimple_stmt_iterator si;
894 int j;
896 nbbs = loop->num_nodes;
897 for (j = 0; j < nbbs; j++)
899 basic_block bb = bbs[j];
900 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
901 free_stmt_vec_info (gsi_stmt (si));
903 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
905 gimple *stmt = gsi_stmt (si);
907 /* We may have broken canonical form by moving a constant
908 into RHS1 of a commutative op. Fix such occurrences. */
909 if (operands_swapped && is_gimple_assign (stmt))
911 enum tree_code code = gimple_assign_rhs_code (stmt);
913 if ((code == PLUS_EXPR
914 || code == POINTER_PLUS_EXPR
915 || code == MULT_EXPR)
916 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
917 swap_ssa_operands (stmt,
918 gimple_assign_rhs1_ptr (stmt),
919 gimple_assign_rhs2_ptr (stmt));
920 else if (code == COND_EXPR
921 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
923 tree cond_expr = gimple_assign_rhs1 (stmt);
924 enum tree_code cond_code = TREE_CODE (cond_expr);
926 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
928 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
929 0));
930 cond_code = invert_tree_comparison (cond_code,
931 honor_nans);
932 if (cond_code != ERROR_MARK)
934 TREE_SET_CODE (cond_expr, cond_code);
935 swap_ssa_operands (stmt,
936 gimple_assign_rhs2_ptr (stmt),
937 gimple_assign_rhs3_ptr (stmt));
943 /* Free stmt_vec_info. */
944 free_stmt_vec_info (stmt);
945 gsi_next (&si);
949 free (bbs);
951 release_vec_loop_masks (&masks);
952 delete ivexpr_map;
954 loop->aux = NULL;
957 /* Return an invariant or register for EXPR and emit necessary
958 computations in the LOOP_VINFO loop preheader. */
960 tree
961 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
963 if (is_gimple_reg (expr)
964 || is_gimple_min_invariant (expr))
965 return expr;
967 if (! loop_vinfo->ivexpr_map)
968 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
969 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
970 if (! cached)
972 gimple_seq stmts = NULL;
973 cached = force_gimple_operand (unshare_expr (expr),
974 &stmts, true, NULL_TREE);
975 if (stmts)
977 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
978 gsi_insert_seq_on_edge_immediate (e, stmts);
981 return cached;
984 /* Return true if we can use CMP_TYPE as the comparison type to produce
985 all masks required to mask LOOP_VINFO. */
987 static bool
988 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
990 rgroup_masks *rgm;
991 unsigned int i;
992 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
993 if (rgm->mask_type != NULL_TREE
994 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
995 cmp_type, rgm->mask_type,
996 OPTIMIZE_FOR_SPEED))
997 return false;
998 return true;
1001 /* Calculate the maximum number of scalars per iteration for every
1002 rgroup in LOOP_VINFO. */
1004 static unsigned int
1005 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1007 unsigned int res = 1;
1008 unsigned int i;
1009 rgroup_masks *rgm;
1010 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1011 res = MAX (res, rgm->max_nscalars_per_iter);
1012 return res;
1015 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1016 whether we can actually generate the masks required. Return true if so,
1017 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
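/* For illustration (a sketch, not the exact IL that is generated): in a
   fully-masked loop with vectorization factor 8, vector iteration i
   operates under a mask that is conceptually

     mask = { i*8+0 < n, i*8+1 < n, ..., i*8+7 < n }

   i.e. the result of WHILE_ULT (i*8, n).  This function checks that the
   target can produce such masks for every rgroup and chooses the scalar
   type used for the "i*8" and "n" operands of that comparison.  */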
1019 static bool
1020 vect_verify_full_masking (loop_vec_info loop_vinfo)
1022 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1023 unsigned int min_ni_width;
1025 /* Use a normal loop if there are no statements that need masking.
1026 This only happens in rare degenerate cases: it means that the loop
1027 has no loads, no stores, and no live-out values. */
1028 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1029 return false;
1031 /* Get the maximum number of iterations that is representable
1032 in the counter type. */
1033 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1034 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1036 /* Get a more refined estimate for the number of iterations. */
1037 widest_int max_back_edges;
1038 if (max_loop_iterations (loop, &max_back_edges))
1039 max_ni = wi::smin (max_ni, max_back_edges + 1);
1041 /* Account for rgroup masks, in which each bit is replicated N times. */
1042 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1044 /* Work out how many bits we need to represent the limit. */
1045 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1047 /* Find a scalar mode for which WHILE_ULT is supported. */
1048 opt_scalar_int_mode cmp_mode_iter;
1049 tree cmp_type = NULL_TREE;
1050 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1052 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1053 if (cmp_bits >= min_ni_width
1054 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1056 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1057 if (this_type
1058 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1060 /* Although we could stop as soon as we find a valid mode,
1061 it's often better to continue until we hit Pmode, since the
1062 operands to the WHILE are more likely to be reusable in
1063 address calculations. */
1064 cmp_type = this_type;
1065 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1066 break;
1071 if (!cmp_type)
1072 return false;
1074 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1075 return true;
1078 /* Calculate the cost of one scalar iteration of the loop. */
1079 static void
1080 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1082 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1083 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1084 int nbbs = loop->num_nodes, factor;
1085 int innerloop_iters, i;
1087 /* Gather costs for statements in the scalar loop. */
1089 /* FORNOW. */
1090 innerloop_iters = 1;
1091 if (loop->inner)
1092 innerloop_iters = 50; /* FIXME */
1094 for (i = 0; i < nbbs; i++)
1096 gimple_stmt_iterator si;
1097 basic_block bb = bbs[i];
1099 if (bb->loop_father == loop->inner)
1100 factor = innerloop_iters;
1101 else
1102 factor = 1;
1104 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1106 gimple *stmt = gsi_stmt (si);
1107 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1109 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1110 continue;
1112 /* Skip stmts that are not vectorized inside the loop. */
1113 if (stmt_info
1114 && !STMT_VINFO_RELEVANT_P (stmt_info)
1115 && (!STMT_VINFO_LIVE_P (stmt_info)
1116 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1117 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1118 continue;
1120 vect_cost_for_stmt kind;
1121 if (STMT_VINFO_DATA_REF (stmt_info))
1123 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1124 kind = scalar_load;
1125 else
1126 kind = scalar_store;
1128 else
1129 kind = scalar_stmt;
1131 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1132 factor, kind, stmt_info, 0, vect_prologue);
1136 /* Now accumulate cost. */
1137 void *target_cost_data = init_cost (loop);
1138 stmt_info_for_cost *si;
1139 int j;
1140 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1141 j, si)
1143 struct _stmt_vec_info *stmt_info
1144 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1145 (void) add_stmt_cost (target_cost_data, si->count,
1146 si->kind, stmt_info, si->misalign,
1147 vect_body);
1149 unsigned dummy, body_cost = 0;
1150 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1151 destroy_cost_data (target_cost_data);
1152 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1156 /* Function vect_analyze_loop_form_1.
1158 Verify that certain CFG restrictions hold, including:
1159 - the loop has a pre-header
1160 - the loop has a single entry and exit
1161 - the loop exit condition is simple enough
1162    - the number of iterations can be analyzed, i.e., a countable loop.  The
1163 niter could be analyzed under some assumptions. */
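/* For example (illustrative only), a countable loop in the expected
   do-while form after loop-header copying:

     if (n > 0)
       {
         i = 0;
         do
           {
             a[i] = b[i] + c[i];
             i++;
           }
         while (i < n);
       }

   It has a single exit, an empty latch and an analyzable iteration count.
   A loop with an additional data-dependent "break" would have multiple
   exits and is rejected below.  */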
1165 bool
1166 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1167 tree *assumptions, tree *number_of_iterationsm1,
1168 tree *number_of_iterations, gcond **inner_loop_cond)
1170 if (dump_enabled_p ())
1171 dump_printf_loc (MSG_NOTE, vect_location,
1172 "=== vect_analyze_loop_form ===\n");
1174 /* Different restrictions apply when we are considering an inner-most loop,
1175 vs. an outer (nested) loop.
1176 (FORNOW. May want to relax some of these restrictions in the future). */
1178 if (!loop->inner)
1180 /* Inner-most loop. We currently require that the number of BBs is
1181 exactly 2 (the header and latch). Vectorizable inner-most loops
1182 look like this:
1184 (pre-header)
1186 header <--------+
1187 | | |
1188 | +--> latch --+
1190 (exit-bb) */
1192 if (loop->num_nodes != 2)
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196 "not vectorized: control flow in loop.\n");
1197 return false;
1200 if (empty_block_p (loop->header))
1202 if (dump_enabled_p ())
1203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1204 "not vectorized: empty loop.\n");
1205 return false;
1208 else
1210 struct loop *innerloop = loop->inner;
1211 edge entryedge;
1213 /* Nested loop. We currently require that the loop is doubly-nested,
1214 contains a single inner loop, and the number of BBs is exactly 5.
1215 Vectorizable outer-loops look like this:
1217 (pre-header)
1219 header <---+
1221 inner-loop |
1223 tail ------+
1225 (exit-bb)
1227 The inner-loop has the properties expected of inner-most loops
1228 as described above. */
1230 if ((loop->inner)->inner || (loop->inner)->next)
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: multiple nested loops.\n");
1235 return false;
1238 if (loop->num_nodes != 5)
1240 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1242 "not vectorized: control flow in loop.\n");
1243 return false;
1246 entryedge = loop_preheader_edge (innerloop);
1247 if (entryedge->src != loop->header
1248 || !single_exit (innerloop)
1249 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1251 if (dump_enabled_p ())
1252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1253 "not vectorized: unsupported outerloop form.\n");
1254 return false;
1257 /* Analyze the inner-loop. */
1258 tree inner_niterm1, inner_niter, inner_assumptions;
1259 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1260 &inner_assumptions, &inner_niterm1,
1261 &inner_niter, NULL)
1262 /* Don't support analyzing niter under assumptions for inner
1263 loop. */
1264 || !integer_onep (inner_assumptions))
1266 if (dump_enabled_p ())
1267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1268 "not vectorized: Bad inner loop.\n");
1269 return false;
1272 if (!expr_invariant_in_loop_p (loop, inner_niter))
1274 if (dump_enabled_p ())
1275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1276 "not vectorized: inner-loop count not"
1277 " invariant.\n");
1278 return false;
1281 if (dump_enabled_p ())
1282 dump_printf_loc (MSG_NOTE, vect_location,
1283 "Considering outer-loop vectorization.\n");
1286 if (!single_exit (loop)
1287 || EDGE_COUNT (loop->header->preds) != 2)
1289 if (dump_enabled_p ())
1291 if (!single_exit (loop))
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293 "not vectorized: multiple exits.\n");
1294 else if (EDGE_COUNT (loop->header->preds) != 2)
1295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1296 "not vectorized: too many incoming edges.\n");
1298 return false;
1301   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1302 that the loop is represented as a do-while (with a proper if-guard
1303 before the loop if needed), where the loop header contains all the
1304 executable statements, and the latch is empty. */
1305 if (!empty_block_p (loop->latch)
1306 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1308 if (dump_enabled_p ())
1309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1310 "not vectorized: latch block not empty.\n");
1311 return false;
1314 /* Make sure the exit is not abnormal. */
1315 edge e = single_exit (loop);
1316 if (e->flags & EDGE_ABNORMAL)
1318 if (dump_enabled_p ())
1319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1320 "not vectorized: abnormal loop exit edge.\n");
1321 return false;
1324 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1325 number_of_iterationsm1);
1326 if (!*loop_cond)
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330 "not vectorized: complicated exit condition.\n");
1331 return false;
1334 if (integer_zerop (*assumptions)
1335 || !*number_of_iterations
1336 || chrec_contains_undetermined (*number_of_iterations))
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340 "not vectorized: number of iterations cannot be "
1341 "computed.\n");
1342 return false;
1345 if (integer_zerop (*number_of_iterations))
1347 if (dump_enabled_p ())
1348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1349 "not vectorized: number of iterations = 0.\n");
1350 return false;
1353 return true;
1356 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1358 loop_vec_info
1359 vect_analyze_loop_form (struct loop *loop)
1361 tree assumptions, number_of_iterations, number_of_iterationsm1;
1362 gcond *loop_cond, *inner_loop_cond = NULL;
1364 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1365 &assumptions, &number_of_iterationsm1,
1366 &number_of_iterations, &inner_loop_cond))
1367 return NULL;
1369 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1370 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1371 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1372 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1373 if (!integer_onep (assumptions))
1375 /* We consider to vectorize this loop by versioning it under
1376 some assumptions. In order to do this, we need to clear
1377 existing information computed by scev and niter analyzer. */
1378 scev_reset_htab ();
1379 free_numbers_of_iterations_estimates (loop);
1380 /* Also set flag for this loop so that following scev and niter
1381 analysis are done under the assumptions. */
1382 loop_constraint_set (loop, LOOP_C_FINITE);
1383 /* Also record the assumptions for versioning. */
1384 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1387 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1389 if (dump_enabled_p ())
1391 dump_printf_loc (MSG_NOTE, vect_location,
1392 "Symbolic number of iterations is ");
1393 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1394 dump_printf (MSG_NOTE, "\n");
1398 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1399 if (inner_loop_cond)
1400 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1401 = loop_exit_ctrl_vec_info_type;
1403 gcc_assert (!loop->aux);
1404 loop->aux = loop_vinfo;
1405 return loop_vinfo;
1410 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1411    statements, update the vectorization factor.  */
1413 static void
1414 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1416 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1417 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1418 int nbbs = loop->num_nodes;
1419 poly_uint64 vectorization_factor;
1420 int i;
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "=== vect_update_vf_for_slp ===\n");
1426 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1427 gcc_assert (known_ne (vectorization_factor, 0U));
1429 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1430 vectorization factor of the loop is the unrolling factor required by
1431      the SLP instances.  If that unrolling factor is 1, we say that we
1432      perform pure SLP on the loop - cross-iteration parallelism is not
1433 exploited. */
1434 bool only_slp_in_loop = true;
1435 for (i = 0; i < nbbs; i++)
1437 basic_block bb = bbs[i];
1438 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1439 gsi_next (&si))
1441 gimple *stmt = gsi_stmt (si);
1442 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1443 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1444 && STMT_VINFO_RELATED_STMT (stmt_info))
1446 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1447 stmt_info = vinfo_for_stmt (stmt);
1449 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1450 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1451 && !PURE_SLP_STMT (stmt_info))
1452 /* STMT needs both SLP and loop-based vectorization. */
1453 only_slp_in_loop = false;
1457 if (only_slp_in_loop)
1459 dump_printf_loc (MSG_NOTE, vect_location,
1460 "Loop contains only SLP stmts\n");
1461 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1463 else
1465 dump_printf_loc (MSG_NOTE, vect_location,
1466 "Loop contains SLP and non-SLP stmts\n");
1467 /* Both the vectorization factor and unroll factor have the form
1468 current_vector_size * X for some rational X, so they must have
1469 a common multiple. */
1470 vectorization_factor
1471 = force_common_multiple (vectorization_factor,
1472 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1475 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1476 if (dump_enabled_p ())
1478 dump_printf_loc (MSG_NOTE, vect_location,
1479 "Updating vectorization factor to ");
1480 dump_dec (MSG_NOTE, vectorization_factor);
1481 dump_printf (MSG_NOTE, ".\n");
1485 /* Return true if STMT_INFO describes a double reduction phi and if
1486 the other phi in the reduction is also relevant for vectorization.
1487 This rejects cases such as:
1489 outer1:
1490 x_1 = PHI <x_3(outer2), ...>;
1493 inner:
1494 x_2 = ...;
1497 outer2:
1498 x_3 = PHI <x_2(inner)>;
1500 if nothing in x_2 or elsewhere makes x_1 relevant. */
1502 static bool
1503 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1505 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1506 return false;
1508 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1509 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1512 /* Function vect_analyze_loop_operations.
1514 Scan the loop stmts and make sure they are all vectorizable. */
1516 static bool
1517 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1519 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1520 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1521 int nbbs = loop->num_nodes;
1522 int i;
1523 stmt_vec_info stmt_info;
1524 bool need_to_vectorize = false;
1525 bool ok;
1527 if (dump_enabled_p ())
1528 dump_printf_loc (MSG_NOTE, vect_location,
1529 "=== vect_analyze_loop_operations ===\n");
1531 stmt_vector_for_cost cost_vec;
1532 cost_vec.create (2);
1534 for (i = 0; i < nbbs; i++)
1536 basic_block bb = bbs[i];
1538 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1539 gsi_next (&si))
1541 gphi *phi = si.phi ();
1542 ok = true;
1544 stmt_info = vinfo_for_stmt (phi);
1545 if (dump_enabled_p ())
1547 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1548 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1550 if (virtual_operand_p (gimple_phi_result (phi)))
1551 continue;
1553 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1554 (i.e., a phi in the tail of the outer-loop). */
1555 if (! is_loop_header_bb_p (bb))
1557 /* FORNOW: we currently don't support the case that these phis
1558              are not used in the outer-loop (unless it is a double reduction,
1559              i.e., this phi is vect_reduction_def), because this case
1560              would require us to actually do something here.  */
1561 if (STMT_VINFO_LIVE_P (stmt_info)
1562 && !vect_active_double_reduction_p (stmt_info))
1564 if (dump_enabled_p ())
1565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1566 "Unsupported loop-closed phi in "
1567 "outer-loop.\n");
1568 return false;
1571 /* If PHI is used in the outer loop, we check that its operand
1572 is defined in the inner loop. */
1573 if (STMT_VINFO_RELEVANT_P (stmt_info))
1575 tree phi_op;
1576 gimple *op_def_stmt;
1578 if (gimple_phi_num_args (phi) != 1)
1579 return false;
1581 phi_op = PHI_ARG_DEF (phi, 0);
1582 if (TREE_CODE (phi_op) != SSA_NAME)
1583 return false;
1585 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1586 if (gimple_nop_p (op_def_stmt)
1587 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1588 || !vinfo_for_stmt (op_def_stmt))
1589 return false;
1591 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1592 != vect_used_in_outer
1593 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1594 != vect_used_in_outer_by_reduction)
1595 return false;
1598 continue;
1601 gcc_assert (stmt_info);
1603 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1604 || STMT_VINFO_LIVE_P (stmt_info))
1605 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1607 /* A scalar-dependence cycle that we don't support. */
1608 if (dump_enabled_p ())
1609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1610 "not vectorized: scalar dependence cycle.\n");
1611 return false;
1614 if (STMT_VINFO_RELEVANT_P (stmt_info))
1616 need_to_vectorize = true;
1617 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1618 && ! PURE_SLP_STMT (stmt_info))
1619 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1620 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1621 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1622 && ! PURE_SLP_STMT (stmt_info))
1623 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1624 &cost_vec);
1627 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1628 if (ok
1629 && STMT_VINFO_LIVE_P (stmt_info)
1630 && !PURE_SLP_STMT (stmt_info))
1631 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1632 &cost_vec);
1634 if (!ok)
1636 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "not vectorized: relevant phi not "
1640 "supported: ");
1641 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1643 return false;
1647 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1648 gsi_next (&si))
1650 gimple *stmt = gsi_stmt (si);
1651 if (!gimple_clobber_p (stmt)
1652 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1653 &cost_vec))
1654 return false;
1656 } /* bbs */
1658 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1659 cost_vec.release ();
1661 /* All operations in the loop are either irrelevant (deal with loop
1662 control, or dead), or only used outside the loop and can be moved
1663 out of the loop (e.g. invariants, inductions). The loop can be
1664 optimized away by scalar optimizations. We're better off not
1665 touching this loop. */
1666 if (!need_to_vectorize)
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_NOTE, vect_location,
1670 "All the computation can be taken out of the loop.\n");
1671 if (dump_enabled_p ())
1672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1673 "not vectorized: redundant loop. no profit to "
1674 "vectorize.\n");
1675 return false;
1678 return true;
1681 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1682    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1683 definitely no, or -1 if it's worth retrying. */
1685 static int
1686 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1688 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1689 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1691 /* Only fully-masked loops can have iteration counts less than the
1692 vectorization factor. */
1693 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1695 HOST_WIDE_INT max_niter;
1697 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1698 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1699 else
1700 max_niter = max_stmt_executions_int (loop);
1702 if (max_niter != -1
1703 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1705 if (dump_enabled_p ())
1706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707 "not vectorized: iteration count smaller than "
1708 "vectorization factor.\n");
1709 return 0;
1713 int min_profitable_iters, min_profitable_estimate;
1714 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1715 &min_profitable_estimate);
1717 if (min_profitable_iters < 0)
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1721 "not vectorized: vectorization not profitable.\n");
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724 "not vectorized: vector version will never be "
1725 "profitable.\n");
1726 return -1;
1729 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1730 * assumed_vf);
1732   /* Use the cost model only if it is more conservative than the user-specified
1733 threshold. */
1734 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1735 min_profitable_iters);
1737 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1739 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1740 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1742 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1744 "not vectorized: vectorization not profitable.\n");
1745 if (dump_enabled_p ())
1746 dump_printf_loc (MSG_NOTE, vect_location,
1747 "not vectorized: iteration count smaller than user "
1748 "specified loop bound parameter or minimum profitable "
1749 "iterations (whichever is more conservative).\n");
1750 return 0;
1753 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1754 if (estimated_niter == -1)
1755 estimated_niter = likely_max_stmt_executions_int (loop);
1756 if (estimated_niter != -1
1757 && ((unsigned HOST_WIDE_INT) estimated_niter
1758 < MAX (th, (unsigned) min_profitable_estimate)))
1760 if (dump_enabled_p ())
1761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1762 "not vectorized: estimated iteration count too "
1763 "small.\n");
1764 if (dump_enabled_p ())
1765 dump_printf_loc (MSG_NOTE, vect_location,
1766 "not vectorized: estimated iteration count smaller "
1767 "than specified loop bound parameter or minimum "
1768 "profitable iterations (whichever is more "
1769 "conservative).\n");
1770 return -1;
1773 return 1;
1777 /* Function vect_analyze_loop_2.
1779 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1780 for it. The different analyses will record information in the
1781 loop_vec_info struct. */
1782 static bool
1783 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1785 bool ok;
1786 int res;
1787 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1788 poly_uint64 min_vf = 2;
1789 unsigned int n_stmts = 0;
1791 /* The first group of checks is independent of the vector size. */
1792 fatal = true;
1794 /* Find all data references in the loop (which correspond to vdefs/vuses)
1795 and analyze their evolution in the loop. */
1797 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1799 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1800 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1802 if (dump_enabled_p ())
1803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1804 "not vectorized: loop nest containing two "
1805 "or more consecutive inner loops cannot be "
1806 "vectorized\n");
1807 return false;
1810 for (unsigned i = 0; i < loop->num_nodes; i++)
1811 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1812 !gsi_end_p (gsi); gsi_next (&gsi))
1814 gimple *stmt = gsi_stmt (gsi);
1815 if (is_gimple_debug (stmt))
1816 continue;
1817 ++n_stmts;
1818 if (!find_data_references_in_stmt (loop, stmt,
1819 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1821 if (is_gimple_call (stmt) && loop->safelen)
1823 tree fndecl = gimple_call_fndecl (stmt), op;
1824 if (fndecl != NULL_TREE)
1826 cgraph_node *node = cgraph_node::get (fndecl);
1827 if (node != NULL && node->simd_clones != NULL)
1829 unsigned int j, n = gimple_call_num_args (stmt);
1830 for (j = 0; j < n; j++)
1832 op = gimple_call_arg (stmt, j);
1833 if (DECL_P (op)
1834 || (REFERENCE_CLASS_P (op)
1835 && get_base_address (op)))
1836 break;
1838 op = gimple_call_lhs (stmt);
1839 /* Ignore #pragma omp declare simd functions
1840 if they don't have data references in the
1841 call stmt itself. */
1842 if (j == n
1843 && !(op
1844 && (DECL_P (op)
1845 || (REFERENCE_CLASS_P (op)
1846 && get_base_address (op)))))
1847 continue;
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1853 "not vectorized: loop contains function "
1854 "calls or data references that cannot "
1855 "be analyzed\n");
1856 return false;
1860 /* Analyze the data references and also adjust the minimal
1861 vectorization factor according to the loads and stores. */
1863 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1864 if (!ok)
1866 if (dump_enabled_p ())
1867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868 "bad data references.\n");
1869 return false;
1872 /* Classify all cross-iteration scalar data-flow cycles.
1873 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1874 vect_analyze_scalar_cycles (loop_vinfo);
1876 vect_pattern_recog (loop_vinfo);
1878 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1880 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1881 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1883 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1884 if (!ok)
1886 if (dump_enabled_p ())
1887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888 "bad data access.\n");
1889 return false;
1892 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1894 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1895 if (!ok)
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 "unexpected pattern.\n");
1900 return false;
1903 /* The rest of the analysis below depends on the vector size in some way,
     so failures from here on are no longer treated as fatal. */
1904 fatal = false;
1906 /* Analyze data dependences between the data-refs in the loop
1907 and adjust the maximum vectorization factor according to
1908 the dependences.
1909 FORNOW: fail at the first data dependence that we encounter. */
1911 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1912 if (!ok
1913 || (max_vf != MAX_VECTORIZATION_FACTOR
1914 && maybe_lt (max_vf, min_vf)))
1916 if (dump_enabled_p ())
1917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1918 "bad data dependence.\n");
1919 return false;
1921 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1923 ok = vect_determine_vectorization_factor (loop_vinfo);
1924 if (!ok)
1926 if (dump_enabled_p ())
1927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1928 "can't determine vectorization factor.\n");
1929 return false;
1931 if (max_vf != MAX_VECTORIZATION_FACTOR
1932 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1934 if (dump_enabled_p ())
1935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1936 "bad data dependence.\n");
1937 return false;
1940 /* Compute the scalar iteration cost. */
1941 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1943 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1944 unsigned th;
1946 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1947 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1948 if (!ok)
1949 return false;
1951 /* If there are any SLP instances mark them as pure_slp. */
1952 bool slp = vect_make_slp_decision (loop_vinfo);
1953 if (slp)
1955 /* Find stmts that need to be both vectorized and SLPed. */
1956 vect_detect_hybrid_slp (loop_vinfo);
1958 /* Update the vectorization factor based on the SLP decision. */
1959 vect_update_vf_for_slp (loop_vinfo);
1962 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1964 /* We don't expect to have to roll back to anything other than an empty
1965 set of rgroups. */
1966 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1968 /* This is the point where we can re-start analysis with SLP forced off. */
1969 start_over:
1971 /* Now the vectorization factor is final. */
1972 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1973 gcc_assert (known_ne (vectorization_factor, 0U));
1975 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1977 dump_printf_loc (MSG_NOTE, vect_location,
1978 "vectorization_factor = ");
1979 dump_dec (MSG_NOTE, vectorization_factor);
1980 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1981 LOOP_VINFO_INT_NITERS (loop_vinfo));
1984 HOST_WIDE_INT max_niter
1985 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1987 /* Analyze the alignment of the data-refs in the loop.
1988 Fail if a data reference is found that cannot be vectorized. */
1990 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1991 if (!ok)
1993 if (dump_enabled_p ())
1994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1995 "bad data alignment.\n");
1996 return false;
1999 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2000 It is important to call pruning after vect_analyze_data_ref_accesses,
2001 since we use grouping information gathered by interleaving analysis. */
2002 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2003 if (!ok)
2004 return false;
2006 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2007 vectorization. */
2008 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2010 /* This pass will decide on using loop versioning and/or loop peeling in
2011 order to enhance the alignment of data references in the loop. */
2012 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2013 if (!ok)
2015 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2017 "bad data alignment.\n");
2018 return false;
2022 if (slp)
2024 /* Analyze operations in the SLP instances. Note this may
2025 remove unsupported SLP instances which makes the above
2026 SLP kind detection invalid. */
2027 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2028 vect_slp_analyze_operations (loop_vinfo);
2029 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2030 goto again;
2033 /* Scan all the remaining operations in the loop that are not subject
2034 to SLP and make sure they are vectorizable. */
2035 ok = vect_analyze_loop_operations (loop_vinfo);
2036 if (!ok)
2038 if (dump_enabled_p ())
2039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2040 "bad operation or unsupported loop bound.\n");
2041 return false;
2044 /* Decide whether to use a fully-masked loop for this vectorization
2045 factor. */
2046 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2047 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2048 && vect_verify_full_masking (loop_vinfo));
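 /* As an illustration, with a vector size of 4 a fully-masked version of

      for (int i = 0; i < n; i++)
        a[i] += 1;

    executes (n + 3) / 4 vector iterations, the last one under a mask that
    disables the lanes beyond n, so no scalar epilogue is needed.  */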
2049 if (dump_enabled_p ())
2051 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2052 dump_printf_loc (MSG_NOTE, vect_location,
2053 "using a fully-masked loop.\n");
2054 else
2055 dump_printf_loc (MSG_NOTE, vect_location,
2056 "not using a fully-masked loop.\n");
2059 /* If an epilogue loop is required because of data accesses with gaps,
2060 one additional iteration needs to be peeled. Check if there are
2061 enough iterations for vectorization. */
2062 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2063 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2064 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2066 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2067 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2069 if (known_lt (wi::to_widest (scalar_niters), vf))
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_NOTE, vect_location,
2073 "loop has no enough iterations to support"
2074 " peeling for gaps.\n");
2075 return false;
2079 /* Check that the costings of the loop make vectorizing worthwhile. */
2080 res = vect_analyze_loop_costing (loop_vinfo);
2081 if (res < 0)
2082 goto again;
2083 if (!res)
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2087 "Loop costings not worthwhile.\n");
2088 return false;
2091 /* Decide whether we need to create an epilogue loop to handle
2092 remaining scalar iterations. */
2093 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2095 unsigned HOST_WIDE_INT const_vf;
2096 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2097 /* The main loop handles all iterations. */
2098 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2099 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2100 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2102 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2103 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2104 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2105 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2107 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2108 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2109 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2110 < (unsigned) exact_log2 (const_vf))
2111 /* In case of versioning, check if the maximum number of
2112 iterations is greater than th. If they are identical,
2113 the epilogue is unnecessary. */
2114 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2115 || ((unsigned HOST_WIDE_INT) max_niter
2116 > (th / const_vf) * const_vf))))
2117 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
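 /* For example, with a known trip count of 100, no peeling for alignment,
    no versioning and a vectorization factor of 8, the checks above set
    PEELING_FOR_NITER: the vector loop covers 96 iterations and the
    remaining 100 % 8 = 4 iterations must be handled by an epilogue.  */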
2119 /* If an epilogue loop is required, make sure we can create one. */
2120 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2121 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2123 if (dump_enabled_p ())
2124 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2125 if (!vect_can_advance_ivs_p (loop_vinfo)
2126 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2127 single_exit (LOOP_VINFO_LOOP
2128 (loop_vinfo))))
2130 if (dump_enabled_p ())
2131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2132 "not vectorized: can't create required "
2133 "epilog loop\n");
2134 goto again;
2138 /* During peeling, we need to check whether the number of loop iterations is
2139 enough for both the peeled prologue loop and the vector loop. This check
2140 can be merged with the threshold check of loop versioning, so
2141 increase the threshold for this case if necessary. */
2142 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2144 poly_uint64 niters_th = 0;
2146 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2148 /* Niters for peeled prolog loop. */
2149 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2151 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2152 tree vectype
2153 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2154 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2156 else
2157 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2160 /* Niters for at least one iteration of vectorized loop. */
2161 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2162 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2163 /* One additional iteration because of peeling for gap. */
2164 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2165 niters_th += 1;
2166 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
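 /* For example, if the prologue peels 3 iterations for alignment, the
    vectorization factor is 4 and one extra iteration is peeled for gaps,
    niters_th becomes 3 + 4 + 1 = 8, and the loop versioning check will
    send shorter loops down the scalar path.  */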
2169 gcc_assert (known_eq (vectorization_factor,
2170 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2172 /* Ok to vectorize! */
2173 return true;
2175 again:
2176 /* Try again with SLP forced off, but if we didn't do any SLP there is
2177 no point in re-trying. */
2178 if (!slp)
2179 return false;
2181 /* If there are reduction chains, re-trying will fail anyway. */
2182 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2183 return false;
2185 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2186 via interleaving or lane instructions. */
2187 slp_instance instance;
2188 slp_tree node;
2189 unsigned i, j;
2190 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2192 stmt_vec_info vinfo;
2193 vinfo = vinfo_for_stmt
2194 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2195 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2196 continue;
2197 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2198 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2199 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2200 if (! vect_store_lanes_supported (vectype, size, false)
2201 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2202 && ! vect_grouped_store_supported (vectype, size))
2203 return false;
2204 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2206 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2207 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2208 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2209 size = STMT_VINFO_GROUP_SIZE (vinfo);
2210 vectype = STMT_VINFO_VECTYPE (vinfo);
2211 if (! vect_load_lanes_supported (vectype, size, false)
2212 && ! vect_grouped_load_supported (vectype, single_element_p,
2213 size))
2214 return false;
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_NOTE, vect_location,
2220 "re-trying with SLP disabled\n");
2222 /* Roll back state appropriately. No SLP this time. */
2223 slp = false;
2225 /* Restore the vectorization factor as it was without SLP. */
2225 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2226 /* Free the SLP instances. */
2227 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2228 vect_free_slp_instance (instance);
2229 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2230 /* Reset SLP type to loop_vect on all stmts. */
2231 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2233 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2234 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2235 !gsi_end_p (si); gsi_next (&si))
2237 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2238 STMT_SLP_TYPE (stmt_info) = loop_vect;
2240 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2241 !gsi_end_p (si); gsi_next (&si))
2243 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2244 STMT_SLP_TYPE (stmt_info) = loop_vect;
2245 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2247 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2248 STMT_SLP_TYPE (stmt_info) = loop_vect;
2249 for (gimple_stmt_iterator pi
2250 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2251 !gsi_end_p (pi); gsi_next (&pi))
2253 gimple *pstmt = gsi_stmt (pi);
2254 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2259 /* Free optimized alias test DDRS. */
2260 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2261 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2262 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2263 /* Reset target cost data. */
2264 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2265 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2266 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2267 /* Reset accumulated rgroup information. */
2268 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2269 /* Reset assorted flags. */
2270 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2271 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2272 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2273 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2274 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2276 goto start_over;
2279 /* Function vect_analyze_loop.
2281 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2282 for it. The different analyses will record information in the
2283 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2284 be vectorized. */
2285 loop_vec_info
2286 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2288 loop_vec_info loop_vinfo;
2289 auto_vector_sizes vector_sizes;
2291 /* Autodetect first vector size we try. */
2292 current_vector_size = 0;
2293 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2294 unsigned int next_size = 0;
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "===== analyze_loop_nest =====\n");
2300 if (loop_outer (loop)
2301 && loop_vec_info_for_loop (loop_outer (loop))
2302 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_NOTE, vect_location,
2306 "outer-loop already vectorized.\n");
2307 return NULL;
2310 poly_uint64 autodetected_vector_size = 0;
2311 while (1)
2313 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2314 loop_vinfo = vect_analyze_loop_form (loop);
2315 if (!loop_vinfo)
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2319 "bad loop form.\n");
2320 return NULL;
2323 bool fatal = false;
2325 if (orig_loop_vinfo)
2326 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2328 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2330 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2332 return loop_vinfo;
2335 delete loop_vinfo;
2337 if (next_size == 0)
2338 autodetected_vector_size = current_vector_size;
2340 if (next_size < vector_sizes.length ()
2341 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2342 next_size += 1;
2344 if (fatal
2345 || next_size == vector_sizes.length ()
2346 || known_eq (current_vector_size, 0U))
2347 return NULL;
2349 /* Try the next biggest vector size. */
2350 current_vector_size = vector_sizes[next_size++];
2351 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "***** Re-trying analysis with "
2355 "vector size ");
2356 dump_dec (MSG_NOTE, current_vector_size);
2357 dump_printf (MSG_NOTE, "\n");
2362 /* Return true if there is an in-order reduction function for CODE, storing
2363 it in *REDUC_FN if so. */
2365 static bool
2366 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2368 switch (code)
2370 case PLUS_EXPR:
2371 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2372 return true;
2374 default:
2375 return false;
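 /* For example, without -fassociative-math the floating-point accumulation

      double sum = 0.0;
      for (int i = 0; i < n; i++)
        sum += a[i];

    must preserve the original order of the additions, so it can only be
    vectorized as a fold-left reduction, using IFN_FOLD_LEFT_PLUS when the
    target provides it.  */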
2379 /* Function reduction_fn_for_scalar_code
2381 Input:
2382 CODE - tree_code of a reduction operation.
2384 Output:
2385 REDUC_FN - the corresponding internal function to be used to reduce the
2386 vector of partial results into a single scalar result, or IFN_LAST
2387 if the operation is a supported reduction operation, but does not have
2388 such an internal function.
2390 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2392 static bool
2393 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2395 switch (code)
2397 case MAX_EXPR:
2398 *reduc_fn = IFN_REDUC_MAX;
2399 return true;
2401 case MIN_EXPR:
2402 *reduc_fn = IFN_REDUC_MIN;
2403 return true;
2405 case PLUS_EXPR:
2406 *reduc_fn = IFN_REDUC_PLUS;
2407 return true;
2409 case BIT_AND_EXPR:
2410 *reduc_fn = IFN_REDUC_AND;
2411 return true;
2413 case BIT_IOR_EXPR:
2414 *reduc_fn = IFN_REDUC_IOR;
2415 return true;
2417 case BIT_XOR_EXPR:
2418 *reduc_fn = IFN_REDUC_XOR;
2419 return true;
2421 case MULT_EXPR:
2422 case MINUS_EXPR:
2423 *reduc_fn = IFN_LAST;
2424 return true;
2426 default:
2427 return false;
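 /* For example, a vectorized version of

      int sum = 0;
      for (int i = 0; i < n; i++)
        sum += a[i];

    keeps a vector of partial sums inside the loop and applies IFN_REDUC_PLUS
    once in the epilogue to collapse it into the scalar result.  MULT_EXPR and
    MINUS_EXPR return IFN_LAST above because the final reduction is instead
    open-coded (e.g. by repeatedly shifting and combining the partial-result
    vector).  */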
2431 /* If there is a neutral value X such that SLP reduction NODE would not
2432 be affected by the introduction of additional X elements, return that X,
2433 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2434 is true if the SLP statements perform a single reduction, false if each
2435 statement performs an independent reduction. */
2437 static tree
2438 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2439 bool reduc_chain)
2441 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2442 gimple *stmt = stmts[0];
2443 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2444 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2445 tree scalar_type = TREE_TYPE (vector_type);
2446 struct loop *loop = gimple_bb (stmt)->loop_father;
2447 gcc_assert (loop);
2449 switch (code)
2451 case WIDEN_SUM_EXPR:
2452 case DOT_PROD_EXPR:
2453 case SAD_EXPR:
2454 case PLUS_EXPR:
2455 case MINUS_EXPR:
2456 case BIT_IOR_EXPR:
2457 case BIT_XOR_EXPR:
2458 return build_zero_cst (scalar_type);
2460 case MULT_EXPR:
2461 return build_one_cst (scalar_type);
2463 case BIT_AND_EXPR:
2464 return build_all_ones_cst (scalar_type);
2466 case MAX_EXPR:
2467 case MIN_EXPR:
2468 /* For MIN/MAX the initial values are neutral. A reduction chain
2469 has only a single initial value, so that value is neutral for
2470 all statements. */
2471 if (reduc_chain)
2472 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2473 return NULL_TREE;
2475 default:
2476 return NULL_TREE;
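 /* For example, if an SLP node holds four independent "+" reductions but the
    vector type has eight lanes, the remaining lanes can be filled with the
    neutral value 0 without changing any of the four results; "*" would use 1
    and "&" an all-ones value.  */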
2480 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2481 STMT is printed with a message MSG. */
2483 static void
2484 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2486 dump_printf_loc (msg_type, vect_location, "%s", msg);
2487 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2491 /* Detect SLP reduction of the form:
2493 #a1 = phi <a5, a0>
2494 a2 = operation (a1)
2495 a3 = operation (a2)
2496 a4 = operation (a3)
2497 a5 = operation (a4)
2499 #a = phi <a5>
2501 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2502 FIRST_STMT is the first reduction stmt in the chain
2503 (a2 = operation (a1)).
2505 Return TRUE if a reduction chain was detected. */
2507 static bool
2508 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2509 gimple *first_stmt)
2511 struct loop *loop = (gimple_bb (phi))->loop_father;
2512 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2513 enum tree_code code;
2514 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2515 stmt_vec_info use_stmt_info, current_stmt_info;
2516 tree lhs;
2517 imm_use_iterator imm_iter;
2518 use_operand_p use_p;
2519 int nloop_uses, size = 0, n_out_of_loop_uses;
2520 bool found = false;
2522 if (loop != vect_loop)
2523 return false;
2525 lhs = PHI_RESULT (phi);
2526 code = gimple_assign_rhs_code (first_stmt);
2527 while (1)
2529 nloop_uses = 0;
2530 n_out_of_loop_uses = 0;
2531 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2533 gimple *use_stmt = USE_STMT (use_p);
2534 if (is_gimple_debug (use_stmt))
2535 continue;
2537 /* Check if we got back to the reduction phi. */
2538 if (use_stmt == phi)
2540 loop_use_stmt = use_stmt;
2541 found = true;
2542 break;
2545 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2547 loop_use_stmt = use_stmt;
2548 nloop_uses++;
2550 else
2551 n_out_of_loop_uses++;
2553 /* There can be either a single use in the loop or two uses in
2554 phi nodes. */
2555 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2556 return false;
2559 if (found)
2560 break;
2562 /* We reached a statement with no loop uses. */
2563 if (nloop_uses == 0)
2564 return false;
2566 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2567 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2568 return false;
2570 if (!is_gimple_assign (loop_use_stmt)
2571 || code != gimple_assign_rhs_code (loop_use_stmt)
2572 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2573 return false;
2575 /* Insert USE_STMT into reduction chain. */
2576 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2577 if (current_stmt)
2579 current_stmt_info = vinfo_for_stmt (current_stmt);
2580 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2581 GROUP_FIRST_ELEMENT (use_stmt_info)
2582 = GROUP_FIRST_ELEMENT (current_stmt_info);
2584 else
2585 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2587 lhs = gimple_assign_lhs (loop_use_stmt);
2588 current_stmt = loop_use_stmt;
2589 size++;
2592 if (!found || loop_use_stmt != phi || size < 2)
2593 return false;
2595 /* Swap the operands, if needed, to make the reduction operand be the second
2596 operand. */
2597 lhs = PHI_RESULT (phi);
2598 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2599 while (next_stmt)
2601 if (gimple_assign_rhs2 (next_stmt) == lhs)
2603 tree op = gimple_assign_rhs1 (next_stmt);
2604 gimple *def_stmt = NULL;
2606 if (TREE_CODE (op) == SSA_NAME)
2607 def_stmt = SSA_NAME_DEF_STMT (op);
2609 /* Check that the other def is either defined in the loop
2610 ("vect_internal_def"), or it's an induction (defined by a
2611 loop-header phi-node). */
2612 if (def_stmt
2613 && gimple_bb (def_stmt)
2614 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2615 && (is_gimple_assign (def_stmt)
2616 || is_gimple_call (def_stmt)
2617 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2618 == vect_induction_def
2619 || (gimple_code (def_stmt) == GIMPLE_PHI
2620 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2621 == vect_internal_def
2622 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2624 lhs = gimple_assign_lhs (next_stmt);
2625 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2626 continue;
2629 return false;
2631 else
2633 tree op = gimple_assign_rhs2 (next_stmt);
2634 gimple *def_stmt = NULL;
2636 if (TREE_CODE (op) == SSA_NAME)
2637 def_stmt = SSA_NAME_DEF_STMT (op);
2639 /* Check that the other def is either defined in the loop
2640 ("vect_internal_def"), or it's an induction (defined by a
2641 loop-header phi-node). */
2642 if (def_stmt
2643 && gimple_bb (def_stmt)
2644 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2645 && (is_gimple_assign (def_stmt)
2646 || is_gimple_call (def_stmt)
2647 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2648 == vect_induction_def
2649 || (gimple_code (def_stmt) == GIMPLE_PHI
2650 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2651 == vect_internal_def
2652 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2654 if (dump_enabled_p ())
2656 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2657 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2660 swap_ssa_operands (next_stmt,
2661 gimple_assign_rhs1_ptr (next_stmt),
2662 gimple_assign_rhs2_ptr (next_stmt));
2663 update_stmt (next_stmt);
2665 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2666 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2668 else
2669 return false;
2672 lhs = gimple_assign_lhs (next_stmt);
2673 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2676 /* Save the chain for further analysis in SLP detection. */
2677 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2678 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2679 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2681 return true;
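 /* Source code producing such a chain typically looks like

      int sum = 0;
      for (int i = 0; i < n; i++)
        {
          sum += a[2 * i];
          sum += a[2 * i + 1];
        }

    where every statement accumulates into the previous value of SUM and only
    the final value is used after the loop.  */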
2684 /* Return true if we need an in-order reduction for operation CODE
2685 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2686 overflow must wrap. */
2688 static bool
2689 needs_fold_left_reduction_p (tree type, tree_code code,
2690 bool need_wrapping_integral_overflow)
2692 /* CHECKME: check for !flag_finite_math_only too? */
2693 if (SCALAR_FLOAT_TYPE_P (type))
2694 switch (code)
2696 case MIN_EXPR:
2697 case MAX_EXPR:
2698 return false;
2700 default:
2701 return !flag_associative_math;
2704 if (INTEGRAL_TYPE_P (type))
2706 if (!operation_no_trapping_overflow (type, code))
2707 return true;
2708 if (need_wrapping_integral_overflow
2709 && !TYPE_OVERFLOW_WRAPS (type)
2710 && operation_can_overflow (code))
2711 return true;
2712 return false;
2715 if (SAT_FIXED_POINT_TYPE_P (type))
2716 return true;
2718 return false;
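 /* For example, with -ftrapv the signed accumulation

      int sum = 0;
      for (int i = 0; i < n; i++)
        sum += a[i];

    cannot be reassociated: a reordered sum could trap on an intermediate
    overflow that the original evaluation order would never produce, so an
    in-order reduction is required.  */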
2721 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2722 reduction operation CODE has a handled computation expression. */
2724 bool
2725 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2726 enum tree_code code)
2728 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2729 auto_bitmap visited;
2730 tree lookfor = PHI_RESULT (phi);
2731 ssa_op_iter curri;
2732 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2733 while (USE_FROM_PTR (curr) != loop_arg)
2734 curr = op_iter_next_use (&curri);
2735 curri.i = curri.numops;
2738 path.safe_push (std::make_pair (curri, curr));
2739 tree use = USE_FROM_PTR (curr);
2740 if (use == lookfor)
2741 break;
2742 gimple *def = SSA_NAME_DEF_STMT (use);
2743 if (gimple_nop_p (def)
2744 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2746 pop:
2749 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2750 curri = x.first;
2751 curr = x.second;
2753 curr = op_iter_next_use (&curri);
2754 /* Skip already visited or non-SSA operands (from iterating
2755 over PHI args). */
2756 while (curr != NULL_USE_OPERAND_P
2757 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2758 || ! bitmap_set_bit (visited,
2759 SSA_NAME_VERSION
2760 (USE_FROM_PTR (curr)))));
2762 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2763 if (curr == NULL_USE_OPERAND_P)
2764 break;
2766 else
2768 if (gimple_code (def) == GIMPLE_PHI)
2769 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2770 else
2771 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2772 while (curr != NULL_USE_OPERAND_P
2773 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2774 || ! bitmap_set_bit (visited,
2775 SSA_NAME_VERSION
2776 (USE_FROM_PTR (curr)))))
2777 curr = op_iter_next_use (&curri);
2778 if (curr == NULL_USE_OPERAND_P)
2779 goto pop;
2782 while (1);
2783 if (dump_file && (dump_flags & TDF_DETAILS))
2785 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2786 unsigned i;
2787 std::pair<ssa_op_iter, use_operand_p> *x;
2788 FOR_EACH_VEC_ELT (path, i, x)
2790 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2791 dump_printf (MSG_NOTE, " ");
2793 dump_printf (MSG_NOTE, "\n");
2796 /* Check whether the reduction path detected is valid. */
2797 bool fail = path.length () == 0;
2798 bool neg = false;
2799 for (unsigned i = 1; i < path.length (); ++i)
2801 gimple *use_stmt = USE_STMT (path[i].second);
2802 tree op = USE_FROM_PTR (path[i].second);
2803 if (! has_single_use (op)
2804 || ! is_gimple_assign (use_stmt))
2806 fail = true;
2807 break;
2809 if (gimple_assign_rhs_code (use_stmt) != code)
2811 if (code == PLUS_EXPR
2812 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2814 /* Track whether we negate the reduction value each iteration. */
2815 if (gimple_assign_rhs2 (use_stmt) == op)
2816 neg = ! neg;
2818 else
2820 fail = true;
2821 break;
2825 return ! fail && ! neg;
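 /* For example, with CODE == PLUS_EXPR the walk above accepts

      for (int i = 0; i < n; i++)
        s = (s + a[i]) - b[i];

    because the MINUS_EXPR only subtracts b[i], not the running value, but it
    rejects

      for (int i = 0; i < n; i++)
        s = b[i] - (s + a[i]);

    because there the running value itself is negated on every iteration,
    which is what NEG tracks.  */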
2829 /* Function vect_is_simple_reduction
2831 (1) Detect a cross-iteration def-use cycle that represents a simple
2832 reduction computation. We look for the following pattern:
2834 loop_header:
2835 a1 = phi < a0, a2 >
2836 a3 = ...
2837 a2 = operation (a3, a1)
2841 a3 = ...
2842 loop_header:
2843 a1 = phi < a0, a2 >
2844 a2 = operation (a3, a1)
2846 such that:
2847 1. operation is commutative and associative and it is safe to
2848 change the order of the computation
2849 2. no uses for a2 in the loop (a2 is used out of the loop)
2850 3. no uses of a1 in the loop besides the reduction operation
2851 4. no uses of a1 outside the loop.
2853 Conditions 1,4 are tested here.
2854 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2856 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2857 nested cycles.
2859 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2860 reductions:
2862 a1 = phi < a0, a2 >
2863 inner loop (def of a3)
2864 a2 = phi < a3 >
2866 (4) Detect condition expressions, i.e.:
2867 for (int i = 0; i < N; i++)
2868 if (a[i] < val)
2869 ret_val = a[i];
2873 static gimple *
2874 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2875 bool *double_reduc,
2876 bool need_wrapping_integral_overflow,
2877 enum vect_reduction_type *v_reduc_type)
2879 struct loop *loop = (gimple_bb (phi))->loop_father;
2880 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2881 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2882 enum tree_code orig_code, code;
2883 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2884 tree type;
2885 int nloop_uses;
2886 tree name;
2887 imm_use_iterator imm_iter;
2888 use_operand_p use_p;
2889 bool phi_def;
2891 *double_reduc = false;
2892 *v_reduc_type = TREE_CODE_REDUCTION;
2894 tree phi_name = PHI_RESULT (phi);
2895 /* ??? If there are no uses of the PHI result the inner loop reduction
2896 won't be detected as possibly double-reduction by vectorizable_reduction
2897 because that tries to walk the PHI arg from the preheader edge which
2898 can be constant. See PR60382. */
2899 if (has_zero_uses (phi_name))
2900 return NULL;
2901 nloop_uses = 0;
2902 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2904 gimple *use_stmt = USE_STMT (use_p);
2905 if (is_gimple_debug (use_stmt))
2906 continue;
2908 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2910 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2912 "intermediate value used outside loop.\n");
2914 return NULL;
2917 nloop_uses++;
2918 if (nloop_uses > 1)
2920 if (dump_enabled_p ())
2921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2922 "reduction value used in loop.\n");
2923 return NULL;
2926 phi_use_stmt = use_stmt;
2929 edge latch_e = loop_latch_edge (loop);
2930 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2931 if (TREE_CODE (loop_arg) != SSA_NAME)
2933 if (dump_enabled_p ())
2935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2936 "reduction: not ssa_name: ");
2937 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2938 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2940 return NULL;
2943 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2944 if (is_gimple_assign (def_stmt))
2946 name = gimple_assign_lhs (def_stmt);
2947 phi_def = false;
2949 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2951 name = PHI_RESULT (def_stmt);
2952 phi_def = true;
2954 else
2956 if (dump_enabled_p ())
2958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2959 "reduction: unhandled reduction operation: ");
2960 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2962 return NULL;
2965 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2966 return NULL;
2968 nloop_uses = 0;
2969 auto_vec<gphi *, 3> lcphis;
2970 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2972 gimple *use_stmt = USE_STMT (use_p);
2973 if (is_gimple_debug (use_stmt))
2974 continue;
2975 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2976 nloop_uses++;
2977 else
2978 /* We can have more than one loop-closed PHI. */
2979 lcphis.safe_push (as_a <gphi *> (use_stmt));
2980 if (nloop_uses > 1)
2982 if (dump_enabled_p ())
2983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2984 "reduction used in loop.\n");
2985 return NULL;
2989 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2990 defined in the inner loop. */
2991 if (phi_def)
2993 op1 = PHI_ARG_DEF (def_stmt, 0);
2995 if (gimple_phi_num_args (def_stmt) != 1
2996 || TREE_CODE (op1) != SSA_NAME)
2998 if (dump_enabled_p ())
2999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3000 "unsupported phi node definition.\n");
3002 return NULL;
3005 def1 = SSA_NAME_DEF_STMT (op1);
3006 if (gimple_bb (def1)
3007 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3008 && loop->inner
3009 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3010 && is_gimple_assign (def1)
3011 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3013 if (dump_enabled_p ())
3014 report_vect_op (MSG_NOTE, def_stmt,
3015 "detected double reduction: ");
3017 *double_reduc = true;
3018 return def_stmt;
3021 return NULL;
3024 /* If we are vectorizing an inner reduction, we execute it in the
3025 original order only when we are not dealing with a
3026 double reduction. */
3027 bool check_reduction = true;
3028 if (flow_loop_nested_p (vect_loop, loop))
3030 gphi *lcphi;
3031 unsigned i;
3032 check_reduction = false;
3033 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3034 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3036 gimple *use_stmt = USE_STMT (use_p);
3037 if (is_gimple_debug (use_stmt))
3038 continue;
3039 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3040 check_reduction = true;
3044 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3045 code = orig_code = gimple_assign_rhs_code (def_stmt);
3047 /* We can handle "res -= x[i]", which is non-associative, by
3048 simply rewriting this into "res += -x[i]". Avoid changing
3049 gimple instruction for the first simple tests and only do this
3050 if we're allowed to change code at all. */
3051 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3052 code = PLUS_EXPR;
3054 if (code == COND_EXPR)
3056 if (! nested_in_vect_loop)
3057 *v_reduc_type = COND_REDUCTION;
3059 op3 = gimple_assign_rhs1 (def_stmt);
3060 if (COMPARISON_CLASS_P (op3))
3062 op4 = TREE_OPERAND (op3, 1);
3063 op3 = TREE_OPERAND (op3, 0);
3065 if (op3 == phi_name || op4 == phi_name)
3067 if (dump_enabled_p ())
3068 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3069 "reduction: condition depends on previous"
3070 " iteration: ");
3071 return NULL;
3074 op1 = gimple_assign_rhs2 (def_stmt);
3075 op2 = gimple_assign_rhs3 (def_stmt);
3077 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3079 if (dump_enabled_p ())
3080 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3081 "reduction: not commutative/associative: ");
3082 return NULL;
3084 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3086 op1 = gimple_assign_rhs1 (def_stmt);
3087 op2 = gimple_assign_rhs2 (def_stmt);
3089 else
3091 if (dump_enabled_p ())
3092 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3093 "reduction: not handled operation: ");
3094 return NULL;
3097 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3099 if (dump_enabled_p ())
3100 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3101 "reduction: both uses not ssa_names: ");
3103 return NULL;
3106 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3107 if ((TREE_CODE (op1) == SSA_NAME
3108 && !types_compatible_p (type,TREE_TYPE (op1)))
3109 || (TREE_CODE (op2) == SSA_NAME
3110 && !types_compatible_p (type, TREE_TYPE (op2)))
3111 || (op3 && TREE_CODE (op3) == SSA_NAME
3112 && !types_compatible_p (type, TREE_TYPE (op3)))
3113 || (op4 && TREE_CODE (op4) == SSA_NAME
3114 && !types_compatible_p (type, TREE_TYPE (op4))))
3116 if (dump_enabled_p ())
3118 dump_printf_loc (MSG_NOTE, vect_location,
3119 "reduction: multiple types: operation type: ");
3120 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3121 dump_printf (MSG_NOTE, ", operands types: ");
3122 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3123 TREE_TYPE (op1));
3124 dump_printf (MSG_NOTE, ",");
3125 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3126 TREE_TYPE (op2));
3127 if (op3)
3129 dump_printf (MSG_NOTE, ",");
3130 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3131 TREE_TYPE (op3));
3134 if (op4)
3136 dump_printf (MSG_NOTE, ",");
3137 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3138 TREE_TYPE (op4));
3140 dump_printf (MSG_NOTE, "\n");
3143 return NULL;
3146 /* Check whether it's ok to change the order of the computation.
3147 Generally, when vectorizing a reduction we change the order of the
3148 computation. This may change the behavior of the program in some
3149 cases, so we need to check that this is ok. One exception is when
3150 vectorizing an outer-loop: the inner-loop is executed sequentially,
3151 and therefore vectorizing reductions in the inner-loop during
3152 outer-loop vectorization is safe. */
3153 if (check_reduction
3154 && *v_reduc_type == TREE_CODE_REDUCTION
3155 && needs_fold_left_reduction_p (type, code,
3156 need_wrapping_integral_overflow))
3157 *v_reduc_type = FOLD_LEFT_REDUCTION;
3159 /* Reduction is safe. We're dealing with one of the following:
3160 1) integer arithmetic and no trapv
3161 2) floating point arithmetic, and special flags permit this optimization
3162 3) nested cycle (i.e., outer loop vectorization). */
3163 if (TREE_CODE (op1) == SSA_NAME)
3164 def1 = SSA_NAME_DEF_STMT (op1);
3166 if (TREE_CODE (op2) == SSA_NAME)
3167 def2 = SSA_NAME_DEF_STMT (op2);
3169 if (code != COND_EXPR
3170 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3172 if (dump_enabled_p ())
3173 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3174 return NULL;
3177 /* Check that one def is the reduction def, defined by PHI,
3178 the other def is either defined in the loop ("vect_internal_def"),
3179 or it's an induction (defined by a loop-header phi-node). */
3181 if (def2 && def2 == phi
3182 && (code == COND_EXPR
3183 || !def1 || gimple_nop_p (def1)
3184 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3185 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3186 && (is_gimple_assign (def1)
3187 || is_gimple_call (def1)
3188 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3189 == vect_induction_def
3190 || (gimple_code (def1) == GIMPLE_PHI
3191 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3192 == vect_internal_def
3193 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3195 if (dump_enabled_p ())
3196 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3197 return def_stmt;
3200 if (def1 && def1 == phi
3201 && (code == COND_EXPR
3202 || !def2 || gimple_nop_p (def2)
3203 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3204 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3205 && (is_gimple_assign (def2)
3206 || is_gimple_call (def2)
3207 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3208 == vect_induction_def
3209 || (gimple_code (def2) == GIMPLE_PHI
3210 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3211 == vect_internal_def
3212 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3214 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3216 /* Check if we can swap operands (just for simplicity - so that
3217 the rest of the code can assume that the reduction variable
3218 is always the last (second) argument). */
3219 if (code == COND_EXPR)
3221 /* Swap cond_expr by inverting the condition. */
3222 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3223 enum tree_code invert_code = ERROR_MARK;
3224 enum tree_code cond_code = TREE_CODE (cond_expr);
3226 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3228 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3229 invert_code = invert_tree_comparison (cond_code, honor_nans);
3231 if (invert_code != ERROR_MARK)
3233 TREE_SET_CODE (cond_expr, invert_code);
3234 swap_ssa_operands (def_stmt,
3235 gimple_assign_rhs2_ptr (def_stmt),
3236 gimple_assign_rhs3_ptr (def_stmt));
3238 else
3240 if (dump_enabled_p ())
3241 report_vect_op (MSG_NOTE, def_stmt,
3242 "detected reduction: cannot swap operands "
3243 "for cond_expr");
3244 return NULL;
3247 else
3248 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3249 gimple_assign_rhs2_ptr (def_stmt));
3251 if (dump_enabled_p ())
3252 report_vect_op (MSG_NOTE, def_stmt,
3253 "detected reduction: need to swap operands: ");
3255 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3256 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3258 else
3260 if (dump_enabled_p ())
3261 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3264 return def_stmt;
3267 /* Try to find SLP reduction chain. */
3268 if (! nested_in_vect_loop
3269 && code != COND_EXPR
3270 && orig_code != MINUS_EXPR
3271 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3273 if (dump_enabled_p ())
3274 report_vect_op (MSG_NOTE, def_stmt,
3275 "reduction: detected reduction chain: ");
3277 return def_stmt;
3280 /* Dissolve any group that vect_is_slp_reduction may have half-built. */
3281 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3282 while (first)
3284 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3285 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3286 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3287 first = next;
3290 /* Look for the expression computing loop_arg from loop PHI result. */
3291 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3292 code))
3293 return def_stmt;
3295 if (dump_enabled_p ())
3297 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3298 "reduction: unknown pattern: ");
3301 return NULL;
3304 /* Wrapper around vect_is_simple_reduction, which will modify code
3305 in-place if it enables detection of more reductions. Arguments
3306 are as for vect_is_simple_reduction. */
3308 gimple *
3309 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3310 bool *double_reduc,
3311 bool need_wrapping_integral_overflow)
3313 enum vect_reduction_type v_reduc_type;
3314 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3315 need_wrapping_integral_overflow,
3316 &v_reduc_type);
3317 if (def)
3319 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3320 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3321 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3322 reduc_def_info = vinfo_for_stmt (def);
3323 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3324 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3326 return def;
3329 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3330 int
3331 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3332 int *peel_iters_epilogue,
3333 stmt_vector_for_cost *scalar_cost_vec,
3334 stmt_vector_for_cost *prologue_cost_vec,
3335 stmt_vector_for_cost *epilogue_cost_vec)
3337 int retval = 0;
3338 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3340 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3342 *peel_iters_epilogue = assumed_vf / 2;
3343 if (dump_enabled_p ())
3344 dump_printf_loc (MSG_NOTE, vect_location,
3345 "cost model: epilogue peel iters set to vf/2 "
3346 "because loop iterations are unknown .\n");
3348 /* If peeled iterations are known but the number of scalar loop
3349 iterations is unknown, count a taken branch per peeled loop. */
3350 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3351 NULL, 0, vect_prologue);
3352 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3353 NULL, 0, vect_epilogue);
3355 else
3357 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3358 peel_iters_prologue = niters < peel_iters_prologue ?
3359 niters : peel_iters_prologue;
3360 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3361 /* If we need to peel for gaps but no epilogue peeling would otherwise be
3362 required, we have to peel VF iterations. */
3363 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3364 *peel_iters_epilogue = assumed_vf;
3367 stmt_info_for_cost *si;
3368 int j;
3369 if (peel_iters_prologue)
3370 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3372 stmt_vec_info stmt_info
3373 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3374 retval += record_stmt_cost (prologue_cost_vec,
3375 si->count * peel_iters_prologue,
3376 si->kind, stmt_info, si->misalign,
3377 vect_prologue);
3379 if (*peel_iters_epilogue)
3380 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3382 stmt_vec_info stmt_info
3383 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3384 retval += record_stmt_cost (epilogue_cost_vec,
3385 si->count * *peel_iters_epilogue,
3386 si->kind, stmt_info, si->misalign,
3387 vect_epilogue);
3390 return retval;
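 /* Worked example with made-up numbers: for a loop with 100 known scalar
    iterations, an assumed vectorization factor of 8 and
    PEEL_ITERS_PROLOGUE == 3, the code above sets *PEEL_ITERS_EPILOGUE to
    (100 - 3) % 8 == 1 and records 3 copies of the scalar iteration cost for
    the prologue and 1 copy for the epilogue.  */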
3393 /* Function vect_estimate_min_profitable_iters
3395 Return the number of iterations required for the vector version of the
3396 loop to be profitable relative to the cost of the scalar version of the
3397 loop.
3399 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3400 of iterations for vectorization. -1 value means loop vectorization
3401 is not profitable. This returned value may be used for dynamic
3402 profitability check.
3404 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3405 for static check against estimated number of iterations. */
3407 static void
3408 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3409 int *ret_min_profitable_niters,
3410 int *ret_min_profitable_estimate)
3412 int min_profitable_iters;
3413 int min_profitable_estimate;
3414 int peel_iters_prologue;
3415 int peel_iters_epilogue;
3416 unsigned vec_inside_cost = 0;
3417 int vec_outside_cost = 0;
3418 unsigned vec_prologue_cost = 0;
3419 unsigned vec_epilogue_cost = 0;
3420 int scalar_single_iter_cost = 0;
3421 int scalar_outside_cost = 0;
3422 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3423 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3424 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3426 /* Cost model disabled. */
3427 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3429 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3430 *ret_min_profitable_niters = 0;
3431 *ret_min_profitable_estimate = 0;
3432 return;
3435 /* Requires loop versioning tests to handle misalignment. */
3436 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3438 /* FIXME: Make cost depend on complexity of individual check. */
3439 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3440 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3441 vect_prologue);
3442 dump_printf (MSG_NOTE,
3443 "cost model: Adding cost of checks for loop "
3444 "versioning to treat misalignment.\n");
3447 /* Requires loop versioning with alias checks. */
3448 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3450 /* FIXME: Make cost depend on complexity of individual check. */
3451 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3452 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3453 vect_prologue);
3454 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3455 if (len)
3456 /* Count LEN - 1 ANDs and LEN comparisons. */
3457 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3458 NULL, 0, vect_prologue);
3459 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3460 if (len)
3462 /* Count LEN - 1 ANDs and LEN comparisons. */
3463 unsigned int nstmts = len * 2 - 1;
3464 /* +1 for each bias that needs adding. */
3465 for (unsigned int i = 0; i < len; ++i)
3466 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3467 nstmts += 1;
3468 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3469 NULL, 0, vect_prologue);
3471 dump_printf (MSG_NOTE,
3472 "cost model: Adding cost of checks for loop "
3473 "versioning aliasing.\n");
3476 /* Requires loop versioning with niter checks. */
3477 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3479 /* FIXME: Make cost depend on complexity of individual check. */
3480 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3481 vect_prologue);
3482 dump_printf (MSG_NOTE,
3483 "cost model: Adding cost of checks for loop "
3484 "versioning niters.\n");
3487 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3488 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3489 vect_prologue);
3491 /* Count statements in scalar loop. Using this as scalar cost for a single
3492 iteration for now.
3494 TODO: Add outer loop support.
3496 TODO: Consider assigning different costs to different scalar
3497 statements. */
3499 scalar_single_iter_cost
3500 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3502 /* Add additional cost for the peeled instructions in prologue and epilogue
3503 loop. (For fully-masked loops there will be no peeling.)
3505 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3506 at compile time, we assume it's vf/2 (the worst would be vf-1).
3508 TODO: Build an expression that represents peel_iters for prologue and
3509 epilogue to be used in a run-time test. */
3511 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3513 peel_iters_prologue = 0;
3514 peel_iters_epilogue = 0;
3516 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3518 /* We need to peel exactly one iteration. */
3519 peel_iters_epilogue += 1;
3520 stmt_info_for_cost *si;
3521 int j;
3522 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3523 j, si)
3525 struct _stmt_vec_info *stmt_info
3526 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3527 (void) add_stmt_cost (target_cost_data, si->count,
3528 si->kind, stmt_info, si->misalign,
3529 vect_epilogue);
3533 else if (npeel < 0)
3535 peel_iters_prologue = assumed_vf / 2;
3536 dump_printf (MSG_NOTE, "cost model: "
3537 "prologue peel iters set to vf/2.\n");
3539 /* If peeling for alignment is unknown, the loop bound of the main loop
3540 becomes unknown. */
3541 peel_iters_epilogue = assumed_vf / 2;
3542 dump_printf (MSG_NOTE, "cost model: "
3543 "epilogue peel iters set to vf/2 because "
3544 "peeling for alignment is unknown.\n");
3546 /* If peeled iterations are unknown, count a taken branch and a not taken
3547 branch per peeled loop. Even if scalar loop iterations are known,
3548 vector iterations are not known since peeled prologue iterations are
3549 not known. Hence guards remain the same. */
3550 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3551 NULL, 0, vect_prologue);
3552 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3553 NULL, 0, vect_prologue);
3554 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3555 NULL, 0, vect_epilogue);
3556 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3557 NULL, 0, vect_epilogue);
3558 stmt_info_for_cost *si;
3559 int j;
3560 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3562 struct _stmt_vec_info *stmt_info
3563 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3564 (void) add_stmt_cost (target_cost_data,
3565 si->count * peel_iters_prologue,
3566 si->kind, stmt_info, si->misalign,
3567 vect_prologue);
3568 (void) add_stmt_cost (target_cost_data,
3569 si->count * peel_iters_epilogue,
3570 si->kind, stmt_info, si->misalign,
3571 vect_epilogue);
3574 else
3576 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3577 stmt_info_for_cost *si;
3578 int j;
3579 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3581 prologue_cost_vec.create (2);
3582 epilogue_cost_vec.create (2);
3583 peel_iters_prologue = npeel;
3585 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3586 &peel_iters_epilogue,
3587 &LOOP_VINFO_SCALAR_ITERATION_COST
3588 (loop_vinfo),
3589 &prologue_cost_vec,
3590 &epilogue_cost_vec);
3592 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3594 struct _stmt_vec_info *stmt_info
3595 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3596 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3597 si->misalign, vect_prologue);
3600 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3602 struct _stmt_vec_info *stmt_info
3603 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3604 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3605 si->misalign, vect_epilogue);
3608 prologue_cost_vec.release ();
3609 epilogue_cost_vec.release ();
3612 /* FORNOW: The scalar outside cost is incremented in one of the
3613 following ways:
3615 1. The vectorizer checks for alignment and aliasing and generates
3616 a condition that allows dynamic vectorization. A cost model
3617 check is ANDED with the versioning condition. Hence scalar code
3618 path now has the added cost of the versioning check.
3620 if (cost > th & versioning_check)
3621 jmp to vector code
3623 Hence the run-time scalar cost is incremented by the not-taken branch cost.
3625 2. The vectorizer then checks if a prologue is required. If the
3626 cost model check was not done before during versioning, it has to
3627 be done before the prologue check.
3629 if (cost <= th)
3630 prologue = scalar_iters
3631 if (prologue == 0)
3632 jmp to vector code
3633 else
3634 execute prologue
3635 if (prologue == num_iters)
3636 go to exit
3638 Hence the run-time scalar cost is incremented by a taken branch,
3639 plus a not-taken branch, plus a taken branch cost.
3641 3. The vectorizer then checks if an epilogue is required. If the
3642 cost model check was not done before during prologue check, it
3643 has to be done with the epilogue check.
3645 if (prologue == 0)
3646 jmp to vector code
3647 else
3648 execute prologue
3649 if (prologue == num_iters)
3650 go to exit
3651 vector code:
3652 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3653 jmp to epilogue
3655 Hence the run-time scalar cost should be incremented by 2 taken
3656 branches.
3658 TODO: The back end may reorder the BBs differently and reverse
3659 conditions/branch directions. Change the estimates below to
3660 something more reasonable. */
3662 /* If the number of iterations is known and we do not do versioning, we can
3663 decide whether to vectorize at compile time. Hence the scalar version
3664 does not carry cost model guard costs. */
3665 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3666 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3668 /* Cost model check occurs at versioning. */
3669 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3670 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3671 else
3673 /* Cost model check occurs at prologue generation. */
3674 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3675 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3676 + vect_get_stmt_cost (cond_branch_not_taken);
3677 /* Cost model check occurs at epilogue generation. */
3678 else
3679 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3683 /* Complete the target-specific cost calculations. */
3684 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3685 &vec_inside_cost, &vec_epilogue_cost);
3687 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3689 if (dump_enabled_p ())
3691 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3692 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3693 vec_inside_cost);
3694 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3695 vec_prologue_cost);
3696 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3697 vec_epilogue_cost);
3698 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3699 scalar_single_iter_cost);
3700 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3701 scalar_outside_cost);
3702 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3703 vec_outside_cost);
3704 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3705 peel_iters_prologue);
3706 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3707 peel_iters_epilogue);
3710 /* Calculate number of iterations required to make the vector version
3711 profitable, relative to the loop bodies only. The following condition
3712 must hold true:
3713 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3714 where
3715 SIC = scalar iteration cost, VIC = vector iteration cost,
3716 VOC = vector outside cost, VF = vectorization factor,
3717 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3718 SOC = scalar outside cost for run time cost model check. */
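 /* Illustrative example (hypothetical numbers, added for exposition and not
 tied to any target): with SIC = 4, VIC = 6, VF = 4, SOC = 8, VOC = 40 and
 no peeling, the condition reads 4 * niters + 8 > 6 * (niters / 4) + 40,
 i.e. niters > 12.8. The code below computes
 (40 - 8) * 4 / (4 * 4 - 6) = 12 and then bumps the result to 13, since
 at 12 iterations the scalar variant would still be at least as cheap. */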
3720 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3722 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3723 * assumed_vf
3724 - vec_inside_cost * peel_iters_prologue
3725 - vec_inside_cost * peel_iters_epilogue);
3726 if (min_profitable_iters <= 0)
3727 min_profitable_iters = 0;
3728 else
3730 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3731 - vec_inside_cost);
3733 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3734 <= (((int) vec_inside_cost * min_profitable_iters)
3735 + (((int) vec_outside_cost - scalar_outside_cost)
3736 * assumed_vf)))
3737 min_profitable_iters++;
3740 /* vector version will never be profitable. */
3741 else
3743 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3744 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3745 "did not happen for a simd loop");
3747 if (dump_enabled_p ())
3748 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3749 "cost model: the vector iteration cost = %d "
3750 "divided by the scalar iteration cost = %d "
3751 "is greater or equal to the vectorization factor = %d"
3752 ".\n",
3753 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3754 *ret_min_profitable_niters = -1;
3755 *ret_min_profitable_estimate = -1;
3756 return;
3759 dump_printf (MSG_NOTE,
3760 " Calculated minimum iters for profitability: %d\n",
3761 min_profitable_iters);
3763 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3764 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3765 /* We want the vectorized loop to execute at least once. */
3766 min_profitable_iters = assumed_vf + peel_iters_prologue;
3768 if (dump_enabled_p ())
3769 dump_printf_loc (MSG_NOTE, vect_location,
3770 " Runtime profitability threshold = %d\n",
3771 min_profitable_iters);
3773 *ret_min_profitable_niters = min_profitable_iters;
3775 /* Calculate number of iterations required to make the vector version
3776 profitable, relative to the loop bodies only.
3778 The cost of the non-vectorized variant is SIC * niters and it must win over
3779 the vector variant on the expected loop trip count. The following condition must hold true:
3780 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
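 /* Continuing the illustrative numbers above (hypothetical: VOC = 40,
 SOC = 8, SIC = 4, VIC = 6, VF = 4, no peeling), this evaluates to
 (40 + 8) * 4 / (4 * 4 - 6) = 19, so the static estimate threshold ends up
 higher than the runtime threshold of 13. */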
3782 if (vec_outside_cost <= 0)
3783 min_profitable_estimate = 0;
3784 else
3786 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3787 * assumed_vf
3788 - vec_inside_cost * peel_iters_prologue
3789 - vec_inside_cost * peel_iters_epilogue)
3790 / ((scalar_single_iter_cost * assumed_vf)
3791 - vec_inside_cost);
3793 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3794 if (dump_enabled_p ())
3795 dump_printf_loc (MSG_NOTE, vect_location,
3796 " Static estimate profitability threshold = %d\n",
3797 min_profitable_estimate);
3799 *ret_min_profitable_estimate = min_profitable_estimate;
3802 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3803 vector elements (not bits) for a vector with NELT elements. */
3804 static void
3805 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3806 vec_perm_builder *sel)
3808 /* The encoding is a single stepped pattern. Any wrap-around is handled
3809 by vec_perm_indices. */
3810 sel->new_vector (nelt, 1, 3);
3811 for (unsigned int i = 0; i < 3; i++)
3812 sel->quick_push (i + offset);
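 /* Illustration (added for exposition): with OFFSET = 2 and NELT = 8 the
 three encoded elements are {2, 3, 4}; vec_perm_indices extends the series
 to {2, 3, ..., 9}, i.e. a two-input permute that takes elements 2..7 of
 the first input followed by elements 0..1 of the second. With a zero
 second input this is exactly a whole-vector shift by two elements. */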
3815 /* Checks whether the target supports whole-vector shifts for vectors of mode
3816 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3817 it supports vec_perm_const with masks for all necessary shift amounts. */
3818 static bool
3819 have_whole_vector_shift (machine_mode mode)
3821 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3822 return true;
3824 /* Variable-length vectors should be handled via the optab. */
3825 unsigned int nelt;
3826 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3827 return false;
3829 vec_perm_builder sel;
3830 vec_perm_indices indices;
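 /* The loop below checks shift amounts NELT/2, NELT/4, ..., 1 (e.g. 4, 2
 and 1 for an eight-element vector); these are the same offsets that the
 shift-based reduction epilogue generates. */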
3831 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3833 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3834 indices.new_vector (sel, 2, nelt);
3835 if (!can_vec_perm_const_p (mode, indices, false))
3836 return false;
3838 return true;
3841 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3842 functions. Design better to avoid maintenance issues. */
3844 /* Function vect_model_reduction_cost.
3846 Models cost for a reduction operation, including the vector ops
3847 generated within the strip-mine loop, the initial definition before
3848 the loop, and the epilogue code that must be generated. */
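 /* For example, for a plain sum reduction with a supported REDUC_FN and
 NCOPIES == 1, the code below records one scalar_to_vec in the prologue,
 one vector_stmt in the loop body, and one vector_stmt plus one
 vec_to_scalar in the epilogue (illustrative summary of the cases
 handled below). */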
3850 static void
3851 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3852 int ncopies, stmt_vector_for_cost *cost_vec)
3854 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3855 enum tree_code code;
3856 optab optab;
3857 tree vectype;
3858 gimple *orig_stmt;
3859 machine_mode mode;
3860 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3861 struct loop *loop = NULL;
3863 if (loop_vinfo)
3864 loop = LOOP_VINFO_LOOP (loop_vinfo);
3866 /* Condition reductions generate two reductions in the loop. */
3867 vect_reduction_type reduction_type
3868 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3869 if (reduction_type == COND_REDUCTION)
3870 ncopies *= 2;
3872 vectype = STMT_VINFO_VECTYPE (stmt_info);
3873 mode = TYPE_MODE (vectype);
3874 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3876 if (!orig_stmt)
3877 orig_stmt = STMT_VINFO_STMT (stmt_info);
3879 code = gimple_assign_rhs_code (orig_stmt);
3881 if (reduction_type == EXTRACT_LAST_REDUCTION
3882 || reduction_type == FOLD_LEFT_REDUCTION)
3884 /* No extra instructions needed in the prologue. */
3885 prologue_cost = 0;
3887 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3888 /* Count one reduction-like operation per vector. */
3889 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3890 stmt_info, 0, vect_body);
3891 else
3893 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3894 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3895 inside_cost = record_stmt_cost (cost_vec, nelements,
3896 vec_to_scalar, stmt_info, 0,
3897 vect_body);
3898 inside_cost += record_stmt_cost (cost_vec, nelements,
3899 scalar_stmt, stmt_info, 0,
3900 vect_body);
3903 else
3905 /* Add in cost for initial definition.
3906 For cond reduction we have four vectors: initial index, step,
3907 initial result of the data reduction, initial value of the index
3908 reduction. */
3909 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3910 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3911 scalar_to_vec, stmt_info, 0,
3912 vect_prologue);
3914 /* Cost of reduction op inside loop. */
3915 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3916 stmt_info, 0, vect_body);
3919 /* Determine cost of epilogue code.
3921 We have a reduction operator that will reduce the vector in one statement.
3922 Also requires scalar extract. */
3924 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3926 if (reduc_fn != IFN_LAST)
3928 if (reduction_type == COND_REDUCTION)
3930 /* An EQ stmt and a COND_EXPR stmt. */
3931 epilogue_cost += record_stmt_cost (cost_vec, 2,
3932 vector_stmt, stmt_info, 0,
3933 vect_epilogue);
3934 /* Reduction of the max index and a reduction of the found
3935 values. */
3936 epilogue_cost += record_stmt_cost (cost_vec, 2,
3937 vec_to_scalar, stmt_info, 0,
3938 vect_epilogue);
3939 /* A broadcast of the max value. */
3940 epilogue_cost += record_stmt_cost (cost_vec, 1,
3941 scalar_to_vec, stmt_info, 0,
3942 vect_epilogue);
3944 else
3946 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3947 stmt_info, 0, vect_epilogue);
3948 epilogue_cost += record_stmt_cost (cost_vec, 1,
3949 vec_to_scalar, stmt_info, 0,
3950 vect_epilogue);
3953 else if (reduction_type == COND_REDUCTION)
3955 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3956 /* Extraction of scalar elements. */
3957 epilogue_cost += record_stmt_cost (cost_vec,
3958 2 * estimated_nunits,
3959 vec_to_scalar, stmt_info, 0,
3960 vect_epilogue);
3961 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3962 epilogue_cost += record_stmt_cost (cost_vec,
3963 2 * estimated_nunits - 3,
3964 scalar_stmt, stmt_info, 0,
3965 vect_epilogue);
3967 else if (reduction_type == EXTRACT_LAST_REDUCTION
3968 || reduction_type == FOLD_LEFT_REDUCTION)
3969 /* No extra instructions are needed in the epilogue. */
3971 else
3973 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3974 tree bitsize
3975 = TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3976 int element_bitsize = tree_to_uhwi (bitsize);
3977 int nelements = vec_size_in_bits / element_bitsize;
3979 if (code == COND_EXPR)
3980 code = MAX_EXPR;
3982 optab = optab_for_tree_code (code, vectype, optab_default);
3984 /* We have a whole vector shift available. */
3985 if (optab != unknown_optab
3986 && VECTOR_MODE_P (mode)
3987 && optab_handler (optab, mode) != CODE_FOR_nothing
3988 && have_whole_vector_shift (mode))
3990 /* Final reduction via vector shifts and the reduction operator.
3991 Also requires scalar extract. */
3992 epilogue_cost += record_stmt_cost (cost_vec,
3993 exact_log2 (nelements) * 2,
3994 vector_stmt, stmt_info, 0,
3995 vect_epilogue);
3996 epilogue_cost += record_stmt_cost (cost_vec, 1,
3997 vec_to_scalar, stmt_info, 0,
3998 vect_epilogue);
4000 else
4001 /* Use extracts and reduction op for final reduction. For N
4002 elements, we have N extracts and N-1 reduction ops. */
4003 epilogue_cost += record_stmt_cost (cost_vec,
4004 nelements + nelements - 1,
4005 vector_stmt, stmt_info, 0,
4006 vect_epilogue);
4010 if (dump_enabled_p ())
4011 dump_printf (MSG_NOTE,
4012 "vect_model_reduction_cost: inside_cost = %d, "
4013 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4014 prologue_cost, epilogue_cost);
4018 /* Function vect_model_induction_cost.
4020 Models cost for induction operations. */
4022 static void
4023 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4024 stmt_vector_for_cost *cost_vec)
4026 unsigned inside_cost, prologue_cost;
4028 if (PURE_SLP_STMT (stmt_info))
4029 return;
4031 /* loop cost for vec_loop. */
4032 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4033 stmt_info, 0, vect_body);
4035 /* prologue cost for vec_init and vec_step. */
4036 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4037 stmt_info, 0, vect_prologue);
4039 if (dump_enabled_p ())
4040 dump_printf_loc (MSG_NOTE, vect_location,
4041 "vect_model_induction_cost: inside_cost = %d, "
4042 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4047 /* Function get_initial_def_for_reduction
4049 Input:
4050 STMT - a stmt that performs a reduction operation in the loop.
4051 INIT_VAL - the initial value of the reduction variable
4053 Output:
4054 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4055 of the reduction (used for adjusting the epilog - see below).
4056 Return a vector variable, initialized according to the operation that STMT
4057 performs. This vector will be used as the initial value of the
4058 vector of partial results.
4060 Option1 (adjust in epilog): Initialize the vector as follows:
4061 add/bit or/xor: [0,0,...,0,0]
4062 mult/bit and: [1,1,...,1,1]
4063 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4064 and when necessary (e.g. add/mult case) let the caller know
4065 that it needs to adjust the result by init_val.
4067 Option2: Initialize the vector as follows:
4068 add/bit or/xor: [init_val,0,0,...,0]
4069 mult/bit and: [init_val,1,1,...,1]
4070 min/max/cond_expr: [init_val,init_val,...,init_val]
4071 and no adjustments are needed.
4073 For example, for the following code:
4075 s = init_val;
4076 for (i=0;i<n;i++)
4077 s = s + a[i];
4079 STMT is 's = s + a[i]', and the reduction variable is 's'.
4080 For a vector of 4 units, we want to return either [0,0,0,init_val],
4081 or [0,0,0,0] and let the caller know that it needs to adjust
4082 the result at the end by 'init_val'.
4084 FORNOW, we are using the 'adjust in epilog' scheme (Option1) when
4085 ADJUSTMENT_DEF is not NULL, because this way the initialization vector is
4086 simpler (same element in all entries), and Option2 otherwise.
4088 A cost model should help decide between these two schemes. */
4090 tree
4091 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4092 tree *adjustment_def)
4094 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4095 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4096 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4097 tree scalar_type = TREE_TYPE (init_val);
4098 tree vectype = get_vectype_for_scalar_type (scalar_type);
4099 enum tree_code code = gimple_assign_rhs_code (stmt);
4100 tree def_for_init;
4101 tree init_def;
4102 bool nested_in_vect_loop = false;
4103 REAL_VALUE_TYPE real_init_val = dconst0;
4104 int int_init_val = 0;
4105 gimple *def_stmt = NULL;
4106 gimple_seq stmts = NULL;
4108 gcc_assert (vectype);
4110 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4111 || SCALAR_FLOAT_TYPE_P (scalar_type));
4113 if (nested_in_vect_loop_p (loop, stmt))
4114 nested_in_vect_loop = true;
4115 else
4116 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4118 /* In case of double reduction we only create a vector variable to be put
4119 in the reduction phi node. The actual statement creation is done in
4120 vect_create_epilog_for_reduction. */
4121 if (adjustment_def && nested_in_vect_loop
4122 && TREE_CODE (init_val) == SSA_NAME
4123 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4124 && gimple_code (def_stmt) == GIMPLE_PHI
4125 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4126 && vinfo_for_stmt (def_stmt)
4127 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4128 == vect_double_reduction_def)
4130 *adjustment_def = NULL;
4131 return vect_create_destination_var (init_val, vectype);
4134 vect_reduction_type reduction_type
4135 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4137 /* In case of a nested reduction do not use an adjustment def as
4138 that case is not supported by the epilogue generation correctly
4139 if ncopies is not one. */
4140 if (adjustment_def && nested_in_vect_loop)
4142 *adjustment_def = NULL;
4143 return vect_get_vec_def_for_operand (init_val, stmt);
4146 switch (code)
4148 case WIDEN_SUM_EXPR:
4149 case DOT_PROD_EXPR:
4150 case SAD_EXPR:
4151 case PLUS_EXPR:
4152 case MINUS_EXPR:
4153 case BIT_IOR_EXPR:
4154 case BIT_XOR_EXPR:
4155 case MULT_EXPR:
4156 case BIT_AND_EXPR:
4158 /* ADJUSTMENT_DEF is NULL when called from
4159 vect_create_epilog_for_reduction to vectorize double reduction. */
4160 if (adjustment_def)
4161 *adjustment_def = init_val;
4163 if (code == MULT_EXPR)
4165 real_init_val = dconst1;
4166 int_init_val = 1;
4169 if (code == BIT_AND_EXPR)
4170 int_init_val = -1;
4172 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4173 def_for_init = build_real (scalar_type, real_init_val);
4174 else
4175 def_for_init = build_int_cst (scalar_type, int_init_val);
4177 if (adjustment_def)
4178 /* Option1: the first element is '0' or '1' as well. */
4179 init_def = gimple_build_vector_from_val (&stmts, vectype,
4180 def_for_init);
4181 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4183 /* Option2 (variable length): the first element is INIT_VAL. */
4184 init_def = build_vector_from_val (vectype, def_for_init);
4185 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4186 2, init_def, init_val);
4187 init_def = make_ssa_name (vectype);
4188 gimple_call_set_lhs (call, init_def);
4189 gimple_seq_add_stmt (&stmts, call);
4191 else
4193 /* Option2: the first element is INIT_VAL. */
4194 tree_vector_builder elts (vectype, 1, 2);
4195 elts.quick_push (init_val);
4196 elts.quick_push (def_for_init);
4197 init_def = gimple_build_vector (&stmts, &elts);
4200 break;
4202 case MIN_EXPR:
4203 case MAX_EXPR:
4204 case COND_EXPR:
4206 if (adjustment_def)
4208 *adjustment_def = NULL_TREE;
4209 if (reduction_type != COND_REDUCTION
4210 && reduction_type != EXTRACT_LAST_REDUCTION)
4212 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4213 break;
4216 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4217 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4219 break;
4221 default:
4222 gcc_unreachable ();
4225 if (stmts)
4226 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4227 return init_def;
4230 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4231 NUMBER_OF_VECTORS is the number of vector defs to create.
4232 If NEUTRAL_OP is nonnull, introducing extra elements of that
4233 value will not change the result. */
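 /* For example (illustration only), the neutral value is 0 for a
 PLUS_EXPR or BIT_IOR_EXPR reduction and 1 for a MULT_EXPR reduction,
 while MIN_EXPR/MAX_EXPR reductions have no universal neutral value and
 fall back to the initial scalar value. */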
4235 static void
4236 get_initial_defs_for_reduction (slp_tree slp_node,
4237 vec<tree> *vec_oprnds,
4238 unsigned int number_of_vectors,
4239 bool reduc_chain, tree neutral_op)
4241 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4242 gimple *stmt = stmts[0];
4243 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4244 unsigned HOST_WIDE_INT nunits;
4245 unsigned j, number_of_places_left_in_vector;
4246 tree vector_type;
4247 tree vop;
4248 int group_size = stmts.length ();
4249 unsigned int vec_num, i;
4250 unsigned number_of_copies = 1;
4251 vec<tree> voprnds;
4252 voprnds.create (number_of_vectors);
4253 struct loop *loop;
4254 auto_vec<tree, 16> permute_results;
4256 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4258 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4260 loop = (gimple_bb (stmt))->loop_father;
4261 gcc_assert (loop);
4262 edge pe = loop_preheader_edge (loop);
4264 gcc_assert (!reduc_chain || neutral_op);
4266 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4267 created vectors. It is greater than 1 if unrolling is performed.
4269 For example, we have two scalar operands, s1 and s2 (e.g., group of
4270 strided accesses of size two), while NUNITS is four (i.e., four scalars
4271 of this type can be packed in a vector). The output vector will contain
4272 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4273 will be 2).
4275 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4276 containing the operands.
4278 For example, NUNITS is four as before, and the group size is 8
4279 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4280 {s5, s6, s7, s8}. */
4282 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4283 nunits = group_size;
4285 number_of_copies = nunits * number_of_vectors / group_size;
4287 number_of_places_left_in_vector = nunits;
4288 bool constant_p = true;
4289 tree_vector_builder elts (vector_type, nunits, 1);
4290 elts.quick_grow (nunits);
4291 for (j = 0; j < number_of_copies; j++)
4293 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4295 tree op;
4296 /* Get the def before the loop. In reduction chain we have only
4297 one initial value. */
4298 if ((j != (number_of_copies - 1)
4299 || (reduc_chain && i != 0))
4300 && neutral_op)
4301 op = neutral_op;
4302 else
4303 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4305 /* Create 'vect_ = {op0,op1,...,opn}'. */
4306 number_of_places_left_in_vector--;
4307 elts[number_of_places_left_in_vector] = op;
4308 if (!CONSTANT_CLASS_P (op))
4309 constant_p = false;
4311 if (number_of_places_left_in_vector == 0)
4313 gimple_seq ctor_seq = NULL;
4314 tree init;
4315 if (constant_p && !neutral_op
4316 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4317 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4318 /* Build the vector directly from ELTS. */
4319 init = gimple_build_vector (&ctor_seq, &elts);
4320 else if (neutral_op)
4322 /* Build a vector of the neutral value and shift the
4323 other elements into place. */
4324 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4325 neutral_op);
4326 int k = nunits;
4327 while (k > 0 && elts[k - 1] == neutral_op)
4328 k -= 1;
4329 while (k > 0)
4331 k -= 1;
4332 gcall *call = gimple_build_call_internal
4333 (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4334 init = make_ssa_name (vector_type);
4335 gimple_call_set_lhs (call, init);
4336 gimple_seq_add_stmt (&ctor_seq, call);
4339 else
4341 /* First time round, duplicate ELTS to fill the
4342 required number of vectors, then cherry pick the
4343 appropriate result for each iteration. */
4344 if (vec_oprnds->is_empty ())
4345 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4346 number_of_vectors,
4347 permute_results);
4348 init = permute_results[number_of_vectors - j - 1];
4350 if (ctor_seq != NULL)
4351 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4352 voprnds.quick_push (init);
4354 number_of_places_left_in_vector = nunits;
4355 elts.new_vector (vector_type, nunits, 1);
4356 elts.quick_grow (nunits);
4357 constant_p = true;
4362 /* Since the vectors are created in the reverse order, we should invert
4363 them. */
4364 vec_num = voprnds.length ();
4365 for (j = vec_num; j != 0; j--)
4367 vop = voprnds[j - 1];
4368 vec_oprnds->quick_push (vop);
4371 voprnds.release ();
4373 /* In case that VF is greater than the unrolling factor needed for the SLP
4374 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4375 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4376 to replicate the vectors. */
4377 tree neutral_vec = NULL;
4378 while (number_of_vectors > vec_oprnds->length ())
4380 if (neutral_op)
4382 if (!neutral_vec)
4384 gimple_seq ctor_seq = NULL;
4385 neutral_vec = gimple_build_vector_from_val
4386 (&ctor_seq, vector_type, neutral_op);
4387 if (ctor_seq != NULL)
4388 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4390 vec_oprnds->quick_push (neutral_vec);
4392 else
4394 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4395 vec_oprnds->quick_push (vop);
4401 /* Function vect_create_epilog_for_reduction
4403 Create code at the loop-epilog to finalize the result of a reduction
4404 computation.
4406 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4407 reduction statements.
4408 STMT is the scalar reduction stmt that is being vectorized.
4409 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4410 number of elements that we can fit in a vectype (nunits). In this case
4411 we have to generate more than one vector stmt - i.e - we need to "unroll"
4412 the vector stmt by a factor VF/nunits. For more details see documentation
4413 in vectorizable_operation.
4414 REDUC_FN is the internal function for the epilog reduction.
4415 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4416 computation.
4417 REDUC_INDEX is the index of the operand in the right hand side of the
4418 statement that is defined by REDUCTION_PHI.
4419 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4420 SLP_NODE is an SLP node containing a group of reduction statements. The
4421 first one in this group is STMT.
4422 INDUC_VAL is, for INTEGER_INDUC_COND_REDUCTION, the value to use for the case
4423 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4424 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4425 any value of the IV in the loop.
4426 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4427 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4428 null if this is not an SLP reduction.
4430 This function:
4431 1. Creates the reduction def-use cycles: sets the arguments for
4432 REDUCTION_PHIS:
4433 The loop-entry argument is the vectorized initial-value of the reduction.
4434 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4435 sums.
4436 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4437 by calling the function specified by REDUC_FN if available, or by
4438 other means (whole-vector shifts or a scalar loop).
4439 The function also creates a new phi node at the loop exit to preserve
4440 loop-closed form, as illustrated below.
4442 The flow at the entry to this function:
4444 loop:
4445 vec_def = phi <null, null> # REDUCTION_PHI
4446 VECT_DEF = vector_stmt # vectorized form of STMT
4447 s_loop = scalar_stmt # (scalar) STMT
4448 loop_exit:
4449 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4450 use <s_out0>
4451 use <s_out0>
4453 The above is transformed by this function into:
4455 loop:
4456 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4457 VECT_DEF = vector_stmt # vectorized form of STMT
4458 s_loop = scalar_stmt # (scalar) STMT
4459 loop_exit:
4460 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4461 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4462 v_out2 = reduce <v_out1>
4463 s_out3 = extract_field <v_out2, 0>
4464 s_out4 = adjust_result <s_out3>
4465 use <s_out4>
4466 use <s_out4>
4469 static void
4470 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4471 gimple *reduc_def_stmt,
4472 int ncopies, internal_fn reduc_fn,
4473 vec<gimple *> reduction_phis,
4474 bool double_reduc,
4475 slp_tree slp_node,
4476 slp_instance slp_node_instance,
4477 tree induc_val, enum tree_code induc_code,
4478 tree neutral_op)
4480 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4481 stmt_vec_info prev_phi_info;
4482 tree vectype;
4483 machine_mode mode;
4484 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4485 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4486 basic_block exit_bb;
4487 tree scalar_dest;
4488 tree scalar_type;
4489 gimple *new_phi = NULL, *phi;
4490 gimple_stmt_iterator exit_gsi;
4491 tree vec_dest;
4492 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4493 gimple *epilog_stmt = NULL;
4494 enum tree_code code = gimple_assign_rhs_code (stmt);
4495 gimple *exit_phi;
4496 tree bitsize;
4497 tree adjustment_def = NULL;
4498 tree vec_initial_def = NULL;
4499 tree expr, def, initial_def = NULL;
4500 tree orig_name, scalar_result;
4501 imm_use_iterator imm_iter, phi_imm_iter;
4502 use_operand_p use_p, phi_use_p;
4503 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4504 bool nested_in_vect_loop = false;
4505 auto_vec<gimple *> new_phis;
4506 auto_vec<gimple *> inner_phis;
4507 enum vect_def_type dt = vect_unknown_def_type;
4508 int j, i;
4509 auto_vec<tree> scalar_results;
4510 unsigned int group_size = 1, k, ratio;
4511 auto_vec<tree> vec_initial_defs;
4512 auto_vec<gimple *> phis;
4513 bool slp_reduc = false;
4514 bool direct_slp_reduc;
4515 tree new_phi_result;
4516 gimple *inner_phi = NULL;
4517 tree induction_index = NULL_TREE;
4519 if (slp_node)
4520 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4522 if (nested_in_vect_loop_p (loop, stmt))
4524 outer_loop = loop;
4525 loop = loop->inner;
4526 nested_in_vect_loop = true;
4527 gcc_assert (!slp_node);
4530 vectype = STMT_VINFO_VECTYPE (stmt_info);
4531 gcc_assert (vectype);
4532 mode = TYPE_MODE (vectype);
4534 /* 1. Create the reduction def-use cycle:
4535 Set the arguments of REDUCTION_PHIS, i.e., transform
4537 loop:
4538 vec_def = phi <null, null> # REDUCTION_PHI
4539 VECT_DEF = vector_stmt # vectorized form of STMT
4542 into:
4544 loop:
4545 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4546 VECT_DEF = vector_stmt # vectorized form of STMT
4549 (in case of SLP, do it for all the phis). */
4551 /* Get the loop-entry arguments. */
4552 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4553 if (slp_node)
4555 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4556 vec_initial_defs.reserve (vec_num);
4557 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4558 &vec_initial_defs, vec_num,
4559 GROUP_FIRST_ELEMENT (stmt_info),
4560 neutral_op);
4562 else
4564 /* Get at the scalar def before the loop, that defines the initial value
4565 of the reduction variable. */
4566 gimple *def_stmt;
4567 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4568 loop_preheader_edge (loop));
4569 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4570 and we can't use zero for induc_val, use initial_def. Similarly
4571 for REDUC_MIN and initial_def larger than the base. */
4572 if (TREE_CODE (initial_def) == INTEGER_CST
4573 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4574 == INTEGER_INDUC_COND_REDUCTION)
4575 && !integer_zerop (induc_val)
4576 && ((induc_code == MAX_EXPR
4577 && tree_int_cst_lt (initial_def, induc_val))
4578 || (induc_code == MIN_EXPR
4579 && tree_int_cst_lt (induc_val, initial_def))))
4580 induc_val = initial_def;
4581 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4582 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4583 &adjustment_def);
4584 vec_initial_defs.create (1);
4585 vec_initial_defs.quick_push (vec_initial_def);
4588 /* Set phi nodes arguments. */
4589 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4591 tree vec_init_def = vec_initial_defs[i];
4592 tree def = vect_defs[i];
4593 for (j = 0; j < ncopies; j++)
4595 if (j != 0)
4597 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4598 if (nested_in_vect_loop)
4599 vec_init_def
4600 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4601 vec_init_def);
4604 /* Set the loop-entry arg of the reduction-phi. */
4606 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4607 == INTEGER_INDUC_COND_REDUCTION)
4609 /* Initialise the reduction phi to zero. This prevents non-zero initial
4610 values from interfering with the reduction op. */
4611 gcc_assert (ncopies == 1);
4612 gcc_assert (i == 0);
4614 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4615 tree induc_val_vec
4616 = build_vector_from_val (vec_init_def_type, induc_val);
4618 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4619 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4621 else
4622 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4623 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4625 /* Set the loop-latch arg for the reduction-phi. */
4626 if (j > 0)
4627 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4629 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4630 UNKNOWN_LOCATION);
4632 if (dump_enabled_p ())
4634 dump_printf_loc (MSG_NOTE, vect_location,
4635 "transform reduction: created def-use cycle: ");
4636 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4637 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4642 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4643 which is updated with the current index of the loop for every match of
4644 the original loop's cond_expr (VEC_STMT). This results in a vector
4645 containing the last time the condition passed for that vector lane.
4646 The first match will be a 1 to allow 0 to be used for non-matching
4647 indexes. If there are no matches at all then the vector will be all
4648 zeroes. */
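 /* For example (hypothetical values): with four lanes the induction
 variable takes the values {1,2,3,4}, {5,6,7,8}, {9,10,11,12}, ... on
 successive vector iterations; a lane whose condition last matched in the
 third vector iteration ends up holding one of {9, 10, 11, 12}, while a
 lane that never matched stays 0. */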
4649 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4651 tree indx_before_incr, indx_after_incr;
4652 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4654 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4655 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4657 int scalar_precision
4658 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4659 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4660 tree cr_index_vector_type = build_vector_type
4661 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4663 /* First we create a simple vector induction variable which starts
4664 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4665 vector size (STEP). */
4667 /* Create a {1,2,3,...} vector. */
4668 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4670 /* Create a vector of the step value. */
4671 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4672 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4674 /* Create an induction variable. */
4675 gimple_stmt_iterator incr_gsi;
4676 bool insert_after;
4677 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4678 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4679 insert_after, &indx_before_incr, &indx_after_incr);
4681 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4682 filled with zeros (VEC_ZERO). */
4684 /* Create a vector of 0s. */
4685 tree zero = build_zero_cst (cr_index_scalar_type);
4686 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4688 /* Create a vector phi node. */
4689 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4690 new_phi = create_phi_node (new_phi_tree, loop->header);
4691 set_vinfo_for_stmt (new_phi,
4692 new_stmt_vec_info (new_phi, loop_vinfo));
4693 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4694 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4696 /* Now take the condition from the loop's original cond_expr
4697 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4698 every match uses values from the induction variable
4699 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4700 (NEW_PHI_TREE).
4701 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4702 the new cond_expr (INDEX_COND_EXPR). */
4704 /* Duplicate the condition from vec_stmt. */
4705 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4707 /* Create a conditional, where the condition is taken from vec_stmt
4708 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4709 else is the phi (NEW_PHI_TREE). */
4710 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4711 ccompare, indx_before_incr,
4712 new_phi_tree);
4713 induction_index = make_ssa_name (cr_index_vector_type);
4714 gimple *index_condition = gimple_build_assign (induction_index,
4715 index_cond_expr);
4716 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4717 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4718 loop_vinfo);
4719 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4720 set_vinfo_for_stmt (index_condition, index_vec_info);
4722 /* Update the phi with the vec cond. */
4723 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4724 loop_latch_edge (loop), UNKNOWN_LOCATION);
4727 /* 2. Create epilog code.
4728 The reduction epilog code operates across the elements of the vector
4729 of partial results computed by the vectorized loop.
4730 The reduction epilog code consists of:
4732 step 1: compute the scalar result in a vector (v_out2)
4733 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4734 step 3: adjust the scalar result (s_out3) if needed.
4736 Step 1 can be accomplished using one of the following three schemes:
4737 (scheme 1) using reduc_fn, if available.
4738 (scheme 2) using whole-vector shifts, if available.
4739 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4740 combined.
4742 The overall epilog code looks like this:
4744 s_out0 = phi <s_loop> # original EXIT_PHI
4745 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4746 v_out2 = reduce <v_out1> # step 1
4747 s_out3 = extract_field <v_out2, 0> # step 2
4748 s_out4 = adjust_result <s_out3> # step 3
4750 (step 3 is optional, and steps 1 and 2 may be combined).
4751 Lastly, the uses of s_out0 are replaced by s_out4. */
4754 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4755 v_out1 = phi <VECT_DEF>
4756 Store them in NEW_PHIS. */
4758 exit_bb = single_exit (loop)->dest;
4759 prev_phi_info = NULL;
4760 new_phis.create (vect_defs.length ());
4761 FOR_EACH_VEC_ELT (vect_defs, i, def)
4763 for (j = 0; j < ncopies; j++)
4765 tree new_def = copy_ssa_name (def);
4766 phi = create_phi_node (new_def, exit_bb);
4767 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4768 if (j == 0)
4769 new_phis.quick_push (phi);
4770 else
4772 def = vect_get_vec_def_for_stmt_copy (dt, def);
4773 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4776 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4777 prev_phi_info = vinfo_for_stmt (phi);
4781 /* The epilogue is created for the outer-loop, i.e., for the loop being
4782 vectorized. Create exit phis for the outer loop. */
4783 if (double_reduc)
4785 loop = outer_loop;
4786 exit_bb = single_exit (loop)->dest;
4787 inner_phis.create (vect_defs.length ());
4788 FOR_EACH_VEC_ELT (new_phis, i, phi)
4790 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4791 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4792 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4793 PHI_RESULT (phi));
4794 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4795 loop_vinfo));
4796 inner_phis.quick_push (phi);
4797 new_phis[i] = outer_phi;
4798 prev_phi_info = vinfo_for_stmt (outer_phi);
4799 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4801 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4802 new_result = copy_ssa_name (PHI_RESULT (phi));
4803 outer_phi = create_phi_node (new_result, exit_bb);
4804 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4805 PHI_RESULT (phi));
4806 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4807 loop_vinfo));
4808 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4809 prev_phi_info = vinfo_for_stmt (outer_phi);
4814 exit_gsi = gsi_after_labels (exit_bb);
4816 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4817 (i.e. when reduc_fn is not available) and in the final adjustment
4818 code (if needed). Also get the original scalar reduction variable as
4819 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4820 represents a reduction pattern), the tree-code and scalar-def are
4821 taken from the original stmt that the pattern-stmt (STMT) replaces.
4822 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4823 are taken from STMT. */
4825 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4826 if (!orig_stmt)
4828 /* Regular reduction */
4829 orig_stmt = stmt;
4831 else
4833 /* Reduction pattern */
4834 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4835 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4836 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4839 code = gimple_assign_rhs_code (orig_stmt);
4840 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4841 partial results are added and not subtracted. */
4842 if (code == MINUS_EXPR)
4843 code = PLUS_EXPR;
4845 scalar_dest = gimple_assign_lhs (orig_stmt);
4846 scalar_type = TREE_TYPE (scalar_dest);
4847 scalar_results.create (group_size);
4848 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4849 bitsize = TYPE_SIZE (scalar_type);
4851 /* In case this is a reduction in an inner-loop while vectorizing an outer
4852 loop - we don't need to extract a single scalar result at the end of the
4853 inner-loop (unless it is double reduction, i.e., the use of reduction is
4854 outside the outer-loop). The final vector of partial results will be used
4855 in the vectorized outer-loop, or reduced to a scalar result at the end of
4856 the outer-loop. */
4857 if (nested_in_vect_loop && !double_reduc)
4858 goto vect_finalize_reduction;
4860 /* SLP reduction without reduction chain, e.g.,
4861 # a1 = phi <a2, a0>
4862 # b1 = phi <b2, b0>
4863 a2 = operation (a1)
4864 b2 = operation (b1) */
4865 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4867 /* True if we should implement SLP_REDUC using native reduction operations
4868 instead of scalar operations. */
4869 direct_slp_reduc = (reduc_fn != IFN_LAST
4870 && slp_reduc
4871 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4873 /* In case of reduction chain, e.g.,
4874 # a1 = phi <a3, a0>
4875 a2 = operation (a1)
4876 a3 = operation (a2),
4878 we may end up with more than one vector result. Here we reduce them to
4879 one vector. */
4880 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4882 tree first_vect = PHI_RESULT (new_phis[0]);
4883 gassign *new_vec_stmt = NULL;
4884 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4885 for (k = 1; k < new_phis.length (); k++)
4887 gimple *next_phi = new_phis[k];
4888 tree second_vect = PHI_RESULT (next_phi);
4889 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4890 new_vec_stmt = gimple_build_assign (tem, code,
4891 first_vect, second_vect);
4892 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4893 first_vect = tem;
4896 new_phi_result = first_vect;
4897 if (new_vec_stmt)
4899 new_phis.truncate (0);
4900 new_phis.safe_push (new_vec_stmt);
4903 /* Likewise if we couldn't use a single def-use cycle. */
4904 else if (ncopies > 1)
4906 gcc_assert (new_phis.length () == 1);
4907 tree first_vect = PHI_RESULT (new_phis[0]);
4908 gassign *new_vec_stmt = NULL;
4909 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4910 gimple *next_phi = new_phis[0];
4911 for (int k = 1; k < ncopies; ++k)
4913 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4914 tree second_vect = PHI_RESULT (next_phi);
4915 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4916 new_vec_stmt = gimple_build_assign (tem, code,
4917 first_vect, second_vect);
4918 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4919 first_vect = tem;
4921 new_phi_result = first_vect;
4922 new_phis.truncate (0);
4923 new_phis.safe_push (new_vec_stmt);
4925 else
4926 new_phi_result = PHI_RESULT (new_phis[0]);
4928 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4929 && reduc_fn != IFN_LAST)
4931 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4932 various data values where the condition matched and another vector
4933 (INDUCTION_INDEX) containing all the indexes of those matches. We
4934 need to extract the last matching index (which will be the index with
4935 highest value) and use this to index into the data vector.
4936 For the case where there were no matches, the data vector will contain
4937 all default values and the index vector will be all zeros. */
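 /* Illustration (hypothetical values): with NEW_PHI_RESULT = {0, 42, 0, 7}
 and INDUCTION_INDEX = {0, 5, 0, 11}, the max index is 11, the comparison
 selects only the last lane, giving {0, 0, 0, 7}, and the final
 IFN_REDUC_MAX extracts 7. */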
4939 /* Get various versions of the type of the vector of indexes. */
4940 tree index_vec_type = TREE_TYPE (induction_index);
4941 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4942 tree index_scalar_type = TREE_TYPE (index_vec_type);
4943 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4944 (index_vec_type);
4946 /* Get an unsigned integer version of the type of the data vector. */
4947 int scalar_precision
4948 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4949 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4950 tree vectype_unsigned = build_vector_type
4951 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4953 /* First we need to create a vector (ZERO_VEC) of zeros and another
4954 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4955 can create using a MAX reduction and then expanding.
4956 In the case where the loop never made any matches, the max index will
4957 be zero. */
4959 /* Vector of {0, 0, 0,...}. */
4960 tree zero_vec = make_ssa_name (vectype);
4961 tree zero_vec_rhs = build_zero_cst (vectype);
4962 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4963 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4965 /* Find maximum value from the vector of found indexes. */
4966 tree max_index = make_ssa_name (index_scalar_type);
4967 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4968 1, induction_index);
4969 gimple_call_set_lhs (max_index_stmt, max_index);
4970 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4972 /* Vector of {max_index, max_index, max_index,...}. */
4973 tree max_index_vec = make_ssa_name (index_vec_type);
4974 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4975 max_index);
4976 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4977 max_index_vec_rhs);
4978 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4980 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4981 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4982 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4983 otherwise. Only one value should match, resulting in a vector
4984 (VEC_COND) with one data value and the rest zeros.
4985 In the case where the loop never made any matches, every index will
4986 match, resulting in a vector with all data values (which will all be
4987 the default value). */
4989 /* Compare the max index vector to the vector of found indexes to find
4990 the position of the max value. */
4991 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4992 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4993 induction_index,
4994 max_index_vec);
4995 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4997 /* Use the compare to choose either values from the data vector or
4998 zero. */
4999 tree vec_cond = make_ssa_name (vectype);
5000 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5001 vec_compare, new_phi_result,
5002 zero_vec);
5003 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5005 /* Finally we need to extract the data value from the vector (VEC_COND)
5006 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5007 reduction, but because this doesn't exist, we can use a MAX reduction
5008 instead. The data value might be signed or a float so we need to cast
5009 it first.
5010 In the case where the loop never made any matches, the data values are
5011 all identical, and so will reduce down correctly. */
5013 /* Make the matched data values unsigned. */
5014 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5015 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5016 vec_cond);
5017 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5018 VIEW_CONVERT_EXPR,
5019 vec_cond_cast_rhs);
5020 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5022 /* Reduce down to a scalar value. */
5023 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5024 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5025 1, vec_cond_cast);
5026 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5027 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5029 /* Convert the reduced value back to the result type and set as the
5030 result. */
5031 gimple_seq stmts = NULL;
5032 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5033 data_reduc);
5034 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5035 scalar_results.safe_push (new_temp);
5037 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5038 && reduc_fn == IFN_LAST)
5040 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5041 idx = 0;
5042 idx_val = induction_index[0];
5043 val = data_reduc[0];
5044 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5045 if (induction_index[i] > idx_val)
5046 val = data_reduc[i], idx_val = induction_index[i];
5047 return val; */
5049 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5050 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5051 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5052 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5053 /* Enforced by vectorizable_reduction, which ensures we have target
5054 support before allowing a conditional reduction on variable-length
5055 vectors. */
5056 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5057 tree idx_val = NULL_TREE, val = NULL_TREE;
5058 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5060 tree old_idx_val = idx_val;
5061 tree old_val = val;
5062 idx_val = make_ssa_name (idx_eltype);
5063 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5064 build3 (BIT_FIELD_REF, idx_eltype,
5065 induction_index,
5066 bitsize_int (el_size),
5067 bitsize_int (off)));
5068 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5069 val = make_ssa_name (data_eltype);
5070 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5071 build3 (BIT_FIELD_REF,
5072 data_eltype,
5073 new_phi_result,
5074 bitsize_int (el_size),
5075 bitsize_int (off)));
5076 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5077 if (off != 0)
5079 tree new_idx_val = idx_val;
5080 tree new_val = val;
5081 if (off != v_size - el_size)
5083 new_idx_val = make_ssa_name (idx_eltype);
5084 epilog_stmt = gimple_build_assign (new_idx_val,
5085 MAX_EXPR, idx_val,
5086 old_idx_val);
5087 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5089 new_val = make_ssa_name (data_eltype);
5090 epilog_stmt = gimple_build_assign (new_val,
5091 COND_EXPR,
5092 build2 (GT_EXPR,
5093 boolean_type_node,
5094 idx_val,
5095 old_idx_val),
5096 val, old_val);
5097 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5098 idx_val = new_idx_val;
5099 val = new_val;
5102 /* Convert the reduced value back to the result type and set as the
5103 result. */
5104 gimple_seq stmts = NULL;
5105 val = gimple_convert (&stmts, scalar_type, val);
5106 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5107 scalar_results.safe_push (val);
5110 /* 2.3 Create the reduction code, using one of the three schemes described
5111 above. In SLP we simply need to extract all the elements from the
5112 vector (without reducing them), so we use scalar shifts. */
5113 else if (reduc_fn != IFN_LAST && !slp_reduc)
5115 tree tmp;
5116 tree vec_elem_type;
5118 /* Case 1: Create:
5119 v_out2 = reduc_expr <v_out1> */
5121 if (dump_enabled_p ())
5122 dump_printf_loc (MSG_NOTE, vect_location,
5123 "Reduce using direct vector reduction.\n");
5125 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5126 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5128 tree tmp_dest
5129 = vect_create_destination_var (scalar_dest, vec_elem_type);
5130 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5131 new_phi_result);
5132 gimple_set_lhs (epilog_stmt, tmp_dest);
5133 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5134 gimple_set_lhs (epilog_stmt, new_temp);
5135 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5137 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5138 new_temp);
5140 else
5142 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5143 new_phi_result);
5144 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5147 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5148 gimple_set_lhs (epilog_stmt, new_temp);
5149 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5151 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5152 == INTEGER_INDUC_COND_REDUCTION)
5153 && !operand_equal_p (initial_def, induc_val, 0))
5155 /* Earlier we set the initial value to be a vector of induc_val
5156 values. Check the result and if it is induc_val then replace
5157 with the original initial value, unless induc_val is
5158 the same as initial_def already. */
5159 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5160 induc_val);
5162 tmp = make_ssa_name (new_scalar_dest);
5163 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5164 initial_def, new_temp);
5165 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5166 new_temp = tmp;
5169 scalar_results.safe_push (new_temp);
5171 else if (direct_slp_reduc)
5173 /* Here we create one vector for each of the GROUP_SIZE results,
5174 with the elements for other SLP statements replaced with the
5175 neutral value. We can then do a normal reduction on each vector. */
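 /* For example (illustration only): with GROUP_SIZE == 2 and a vector of
 partial results {a0, b0, a1, b1}, the first reduction operates on
 {a0, neutral, a1, neutral} and the second on {neutral, b0, neutral, b1}. */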
5177 /* Enforced by vectorizable_reduction. */
5178 gcc_assert (new_phis.length () == 1);
5179 gcc_assert (pow2p_hwi (group_size));
5181 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5182 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5183 gimple_seq seq = NULL;
5185 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5186 and the same element size as VECTYPE. */
5187 tree index = build_index_vector (vectype, 0, 1);
5188 tree index_type = TREE_TYPE (index);
5189 tree index_elt_type = TREE_TYPE (index_type);
5190 tree mask_type = build_same_sized_truth_vector_type (index_type);
5192 /* Create a vector that, for each element, identifies which of
5193 the GROUP_SIZE results should use it. */
5194 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5195 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5196 build_vector_from_val (index_type, index_mask));
5198 /* Get a neutral vector value. This is simply a splat of the neutral
5199 scalar value if we have one, otherwise the initial scalar value
5200 is itself a neutral value. */
5201 tree vector_identity = NULL_TREE;
5202 if (neutral_op)
5203 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5204 neutral_op);
5205 for (unsigned int i = 0; i < group_size; ++i)
5207 /* If there's no universal neutral value, we can use the
5208 initial scalar value from the original PHI. This is used
5209 for MIN and MAX reduction, for example. */
5210 if (!neutral_op)
5212 tree scalar_value
5213 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5214 loop_preheader_edge (loop));
5215 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5216 scalar_value);
5219 /* Calculate the equivalent of:
5221 sel[j] = (index[j] == i);
5223 which selects the elements of NEW_PHI_RESULT that should
5224 be included in the result. */
5225 tree compare_val = build_int_cst (index_elt_type, i);
5226 compare_val = build_vector_from_val (index_type, compare_val);
5227 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5228 index, compare_val);
5230 /* Calculate the equivalent of:
5232 vec = sel ? new_phi_result : vector_identity;
5234 VEC is now suitable for a full vector reduction. */
5235 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5236 sel, new_phi_result, vector_identity);
5238 /* Do the reduction and convert it to the appropriate type. */
5239 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5240 tree scalar = make_ssa_name (TREE_TYPE (vectype));
5241 gimple_call_set_lhs (call, scalar);
5242 gimple_seq_add_stmt (&seq, call);
5243 scalar = gimple_convert (&seq, scalar_type, scalar);
5244 scalar_results.safe_push (scalar);
5246 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5248 else
5250 bool reduce_with_shift;
5251 tree vec_temp;
5253 /* COND reductions all do the final reduction with MAX_EXPR
5254 or MIN_EXPR. */
5255 if (code == COND_EXPR)
5257 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5258 == INTEGER_INDUC_COND_REDUCTION)
5259 code = induc_code;
5260 else
5261 code = MAX_EXPR;
5264 /* See if the target wants to do the final (shift) reduction
5265 in a vector mode of smaller size and first reduce upper/lower
5266 halves against each other. */
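 /* For example, a target might prefer to reduce a 256-bit partial result
 by first combining its two 128-bit halves and then doing the remaining
 shift reduction in the 128-bit mode. */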
5267 enum machine_mode mode1 = mode;
5268 tree vectype1 = vectype;
5269 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5270 unsigned sz1 = sz;
5271 if (!slp_reduc
5272 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5273 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5275 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5276 reduce_with_shift = have_whole_vector_shift (mode1);
5277 if (!VECTOR_MODE_P (mode1))
5278 reduce_with_shift = false;
5279 else
5281 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5282 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5283 reduce_with_shift = false;
5286 /* First reduce the vector to the size we want to do the shift
5287 reduction on, by combining upper and lower halves. */
5288 new_temp = new_phi_result;
5289 while (sz > sz1)
5291 gcc_assert (!slp_reduc);
5292 sz /= 2;
5293 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5295 /* The target has to make sure we support lowpart/highpart
5296 extraction, either via direct vector extract or through
5297 an integer mode punning. */
5298 tree dst1, dst2;
5299 if (convert_optab_handler (vec_extract_optab,
5300 TYPE_MODE (TREE_TYPE (new_temp)),
5301 TYPE_MODE (vectype1))
5302 != CODE_FOR_nothing)
5304 /* Extract sub-vectors directly once vec_extract becomes
5305 a conversion optab. */
5306 dst1 = make_ssa_name (vectype1);
5307 epilog_stmt
5308 = gimple_build_assign (dst1, BIT_FIELD_REF,
5309 build3 (BIT_FIELD_REF, vectype1,
5310 new_temp, TYPE_SIZE (vectype1),
5311 bitsize_int (0)));
5312 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5313 dst2 = make_ssa_name (vectype1);
5314 epilog_stmt
5315 = gimple_build_assign (dst2, BIT_FIELD_REF,
5316 build3 (BIT_FIELD_REF, vectype1,
5317 new_temp, TYPE_SIZE (vectype1),
5318 bitsize_int (sz * BITS_PER_UNIT)));
5319 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5321 else
5323 /* Extract via punning to an appropriately sized integer mode
5324 vector. */
5325 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5326 1);
5327 tree etype = build_vector_type (eltype, 2);
5328 gcc_assert (convert_optab_handler (vec_extract_optab,
5329 TYPE_MODE (etype),
5330 TYPE_MODE (eltype))
5331 != CODE_FOR_nothing);
5332 tree tem = make_ssa_name (etype);
5333 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5334 build1 (VIEW_CONVERT_EXPR,
5335 etype, new_temp));
5336 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5337 new_temp = tem;
5338 tem = make_ssa_name (eltype);
5339 epilog_stmt
5340 = gimple_build_assign (tem, BIT_FIELD_REF,
5341 build3 (BIT_FIELD_REF, eltype,
5342 new_temp, TYPE_SIZE (eltype),
5343 bitsize_int (0)));
5344 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5345 dst1 = make_ssa_name (vectype1);
5346 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5347 build1 (VIEW_CONVERT_EXPR,
5348 vectype1, tem));
5349 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5350 tem = make_ssa_name (eltype);
5351 epilog_stmt
5352 = gimple_build_assign (tem, BIT_FIELD_REF,
5353 build3 (BIT_FIELD_REF, eltype,
5354 new_temp, TYPE_SIZE (eltype),
5355 bitsize_int (sz * BITS_PER_UNIT)));
5356 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5357 dst2 = make_ssa_name (vectype1);
5358 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5359 build1 (VIEW_CONVERT_EXPR,
5360 vectype1, tem));
5361 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5364 new_temp = make_ssa_name (vectype1);
5365 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5366 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5369 if (reduce_with_shift && !slp_reduc)
5371 int element_bitsize = tree_to_uhwi (bitsize);
5372 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5373 for variable-length vectors and also requires direct target support
5374 for loop reductions. */
5375 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5376 int nelements = vec_size_in_bits / element_bitsize;
5377 vec_perm_builder sel;
5378 vec_perm_indices indices;
5380 int elt_offset;
5382 tree zero_vec = build_zero_cst (vectype1);
5383 /* Case 2: Create:
5384 for (offset = nelements/2; offset >= 1; offset/=2)
5386 Create: va' = vec_shift <va, offset>
5387 Create: va = vop <va, va'>
5388 } */
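/* Concretely, for a four-element vector {a, b, c, d} and PLUS:
   shifting by two elements gives {c, d, 0, 0}, and adding yields
   {a+c, b+d, ...}; shifting that by one element and adding again
   leaves a+b+c+d in element zero, which step 2.4 below extracts.  */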
5390 tree rhs;
5392 if (dump_enabled_p ())
5393 dump_printf_loc (MSG_NOTE, vect_location,
5394 "Reduce using vector shifts\n");
5396 mode1 = TYPE_MODE (vectype1);
5397 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5398 for (elt_offset = nelements / 2;
5399 elt_offset >= 1;
5400 elt_offset /= 2)
5402 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5403 indices.new_vector (sel, 2, nelements);
5404 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5405 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5406 new_temp, zero_vec, mask);
5407 new_name = make_ssa_name (vec_dest, epilog_stmt);
5408 gimple_assign_set_lhs (epilog_stmt, new_name);
5409 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5411 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5412 new_temp);
5413 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5414 gimple_assign_set_lhs (epilog_stmt, new_temp);
5415 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5418 /* 2.4 Extract the final scalar result. Create:
5419 s_out3 = extract_field <v_out2, bitpos> */
5421 if (dump_enabled_p ())
5422 dump_printf_loc (MSG_NOTE, vect_location,
5423 "extract scalar result\n");
5425 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5426 bitsize, bitsize_zero_node);
5427 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5428 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5429 gimple_assign_set_lhs (epilog_stmt, new_temp);
5430 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5431 scalar_results.safe_push (new_temp);
5433 else
5435 /* Case 3: Create:
5436 s = extract_field <v_out2, 0>
5437 for (offset = element_size;
5438 offset < vector_size;
5439 offset += element_size;)
5441 Create: s' = extract_field <v_out2, offset>
5442 Create: s = op <s, s'> // For non SLP cases
5443 } */
5445 if (dump_enabled_p ())
5446 dump_printf_loc (MSG_NOTE, vect_location,
5447 "Reduce using scalar code.\n");
5449 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5450 int element_bitsize = tree_to_uhwi (bitsize);
5451 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5453 int bit_offset;
5454 if (gimple_code (new_phi) == GIMPLE_PHI)
5455 vec_temp = PHI_RESULT (new_phi);
5456 else
5457 vec_temp = gimple_assign_lhs (new_phi);
5458 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5459 bitsize_zero_node);
5460 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5461 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5462 gimple_assign_set_lhs (epilog_stmt, new_temp);
5463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5465 /* In SLP we don't need to apply the reduction operation, so we just
5466 collect s' values in SCALAR_RESULTS. */
5467 if (slp_reduc)
5468 scalar_results.safe_push (new_temp);
5470 for (bit_offset = element_bitsize;
5471 bit_offset < vec_size_in_bits;
5472 bit_offset += element_bitsize)
5474 tree bitpos = bitsize_int (bit_offset);
5475 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5476 bitsize, bitpos);
5478 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5479 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5480 gimple_assign_set_lhs (epilog_stmt, new_name);
5481 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5483 if (slp_reduc)
5485 /* In SLP we don't need to apply the reduction operation, so
5486 we just collect s' values in SCALAR_RESULTS. */
5487 new_temp = new_name;
5488 scalar_results.safe_push (new_name);
5490 else
5492 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5493 new_name, new_temp);
5494 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5495 gimple_assign_set_lhs (epilog_stmt, new_temp);
5496 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5501 /* The only case where we need to reduce scalar results in SLP is
5502 unrolling. If the size of SCALAR_RESULTS is greater than
5503 GROUP_SIZE, we reduce them by combining elements modulo
5504 GROUP_SIZE. */
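/* E.g. with GROUP_SIZE 2 and four scalar results r0..r3 (SLP unrolled
   twice), the loop below folds r2 into r0 and r3 into r1.  */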
5505 if (slp_reduc)
5507 tree res, first_res, new_res;
5508 gimple *new_stmt;
5510 /* Reduce multiple scalar results in case of SLP unrolling. */
5511 for (j = group_size; scalar_results.iterate (j, &res);
5512 j++)
5514 first_res = scalar_results[j % group_size];
5515 new_stmt = gimple_build_assign (new_scalar_dest, code,
5516 first_res, res);
5517 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5518 gimple_assign_set_lhs (new_stmt, new_res);
5519 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5520 scalar_results[j % group_size] = new_res;
5523 else
5524 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5525 scalar_results.safe_push (new_temp);
5528 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5529 == INTEGER_INDUC_COND_REDUCTION)
5530 && !operand_equal_p (initial_def, induc_val, 0))
5532 /* Earlier we set the initial value to be a vector of induc_val
5533 values. Check the result and if it is induc_val then replace
5534 it with the original initial value, unless induc_val is
5535 the same as initial_def already. */
5536 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5537 induc_val);
5539 tree tmp = make_ssa_name (new_scalar_dest);
5540 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5541 initial_def, new_temp);
5542 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5543 scalar_results[0] = tmp;
5547 vect_finalize_reduction:
5549 if (double_reduc)
5550 loop = loop->inner;
5552 /* 2.5 Adjust the final result by the initial value of the reduction
5553 variable. (When such adjustment is not needed, then
5554 'adjustment_def' is zero). For example, if code is PLUS we create:
5555 new_temp = loop_exit_def + adjustment_def */
5557 if (adjustment_def)
5559 gcc_assert (!slp_reduc);
5560 if (nested_in_vect_loop)
5562 new_phi = new_phis[0];
5563 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5564 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5565 new_dest = vect_create_destination_var (scalar_dest, vectype);
5567 else
5569 new_temp = scalar_results[0];
5570 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5571 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5572 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5575 epilog_stmt = gimple_build_assign (new_dest, expr);
5576 new_temp = make_ssa_name (new_dest, epilog_stmt);
5577 gimple_assign_set_lhs (epilog_stmt, new_temp);
5578 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5579 if (nested_in_vect_loop)
5581 set_vinfo_for_stmt (epilog_stmt,
5582 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5583 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5584 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5586 if (!double_reduc)
5587 scalar_results.quick_push (new_temp);
5588 else
5589 scalar_results[0] = new_temp;
5591 else
5592 scalar_results[0] = new_temp;
5594 new_phis[0] = epilog_stmt;
5597 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5598 phis with new adjusted scalar results, i.e., replace use <s_out0>
5599 with use <s_out4>.
5601 Transform:
5602 loop_exit:
5603 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5604 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5605 v_out2 = reduce <v_out1>
5606 s_out3 = extract_field <v_out2, 0>
5607 s_out4 = adjust_result <s_out3>
5608 use <s_out0>
5609 use <s_out0>
5611 into:
5613 loop_exit:
5614 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5615 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5616 v_out2 = reduce <v_out1>
5617 s_out3 = extract_field <v_out2, 0>
5618 s_out4 = adjust_result <s_out3>
5619 use <s_out4>
5620 use <s_out4> */
5623 /* In an SLP reduction chain we reduce the vector results into one vector
5624 if necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS
5625 of the last stmt in the reduction chain, since we are looking for the
5626 loop exit phi node. */
5627 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5629 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5630 /* Handle reduction patterns. */
5631 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5632 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5634 scalar_dest = gimple_assign_lhs (dest_stmt);
5635 group_size = 1;
5638 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5639 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5640 need to match SCALAR_RESULTS with corresponding statements. The first
5641 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5642 the first vector stmt, etc.
5643 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
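/* E.g. with GROUP_SIZE 4 and two vector stmts, RATIO is 2: scalar results
   0 and 1 belong to the first vector stmt, 2 and 3 to the second.  */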
5644 if (group_size > new_phis.length ())
5646 ratio = group_size / new_phis.length ();
5647 gcc_assert (!(group_size % new_phis.length ()));
5649 else
5650 ratio = 1;
5652 for (k = 0; k < group_size; k++)
5654 if (k % ratio == 0)
5656 epilog_stmt = new_phis[k / ratio];
5657 reduction_phi = reduction_phis[k / ratio];
5658 if (double_reduc)
5659 inner_phi = inner_phis[k / ratio];
5662 if (slp_reduc)
5664 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5666 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5667 /* SLP statements can't participate in patterns. */
5668 gcc_assert (!orig_stmt);
5669 scalar_dest = gimple_assign_lhs (current_stmt);
5672 phis.create (3);
5673 /* Find the loop-closed-use at the loop exit of the original scalar
5674 result. (The reduction result is expected to have two immediate uses -
5675 one at the latch block, and one at the loop exit). */
5676 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5677 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5678 && !is_gimple_debug (USE_STMT (use_p)))
5679 phis.safe_push (USE_STMT (use_p));
5681 /* While we expect to have found an exit_phi because of loop-closed-ssa
5682 form we can end up without one if the scalar cycle is dead. */
5684 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5686 if (outer_loop)
5688 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5689 gphi *vect_phi;
5691 /* FORNOW. Currently not supporting the case that an inner-loop
5692 reduction is not used in the outer-loop (but only outside the
5693 outer-loop), unless it is a double reduction. */
5694 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5695 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5696 || double_reduc);
5698 if (double_reduc)
5699 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5700 else
5701 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5702 if (!double_reduc
5703 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5704 != vect_double_reduction_def)
5705 continue;
5707 /* Handle double reduction:
5709 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5710 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5711 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5712 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5714 At that point the regular reduction (stmt2 and stmt3) is
5715 already vectorized, as well as the exit phi node, stmt4.
5716 Here we vectorize the phi node of double reduction, stmt1, and
5717 update all relevant statements. */
5719 /* Go through all the uses of s2 to find double reduction phi
5720 node, i.e., stmt1 above. */
5721 orig_name = PHI_RESULT (exit_phi);
5722 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5724 stmt_vec_info use_stmt_vinfo;
5725 stmt_vec_info new_phi_vinfo;
5726 tree vect_phi_init, preheader_arg, vect_phi_res;
5727 basic_block bb = gimple_bb (use_stmt);
5728 gimple *use;
5730 /* Check that USE_STMT is really a double reduction phi
5731 node. */
5732 if (gimple_code (use_stmt) != GIMPLE_PHI
5733 || gimple_phi_num_args (use_stmt) != 2
5734 || bb->loop_father != outer_loop)
5735 continue;
5736 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5737 if (!use_stmt_vinfo
5738 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5739 != vect_double_reduction_def)
5740 continue;
5742 /* Create vector phi node for double reduction:
5743 vs1 = phi <vs0, vs2>
5744 vs1 was created previously in this function by a call to
5745 vect_get_vec_def_for_operand and is stored in
5746 vec_initial_def;
5747 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5748 vs0 is created here. */
5750 /* Create vector phi node. */
5751 vect_phi = create_phi_node (vec_initial_def, bb);
5752 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5753 loop_vec_info_for_loop (outer_loop));
5754 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5756 /* Create vs0 - initial def of the double reduction phi. */
5757 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5758 loop_preheader_edge (outer_loop));
5759 vect_phi_init = get_initial_def_for_reduction
5760 (stmt, preheader_arg, NULL);
5762 /* Update phi node arguments with vs0 and vs2. */
5763 add_phi_arg (vect_phi, vect_phi_init,
5764 loop_preheader_edge (outer_loop),
5765 UNKNOWN_LOCATION);
5766 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5767 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5768 if (dump_enabled_p ())
5770 dump_printf_loc (MSG_NOTE, vect_location,
5771 "created double reduction phi node: ");
5772 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5775 vect_phi_res = PHI_RESULT (vect_phi);
5777 /* Replace the use, i.e., set the correct vs1 in the regular
5778 reduction phi node. FORNOW, NCOPIES is always 1, so the
5779 loop is redundant. */
5780 use = reduction_phi;
5781 for (j = 0; j < ncopies; j++)
5783 edge pr_edge = loop_preheader_edge (loop);
5784 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5785 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5791 phis.release ();
5792 if (nested_in_vect_loop)
5794 if (double_reduc)
5795 loop = outer_loop;
5796 else
5797 continue;
5800 phis.create (3);
5801 /* Find the loop-closed-use at the loop exit of the original scalar
5802 result. (The reduction result is expected to have two immediate uses,
5803 one at the latch block, and one at the loop exit). For double
5804 reductions we are looking for exit phis of the outer loop. */
5805 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5807 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5809 if (!is_gimple_debug (USE_STMT (use_p)))
5810 phis.safe_push (USE_STMT (use_p));
5812 else
5814 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5816 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5818 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5820 if (!flow_bb_inside_loop_p (loop,
5821 gimple_bb (USE_STMT (phi_use_p)))
5822 && !is_gimple_debug (USE_STMT (phi_use_p)))
5823 phis.safe_push (USE_STMT (phi_use_p));
5829 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5831 /* Replace the uses: */
5832 orig_name = PHI_RESULT (exit_phi);
5833 scalar_result = scalar_results[k];
5834 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5835 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5836 SET_USE (use_p, scalar_result);
5839 phis.release ();
5843 /* Return a vector of type VECTYPE that is equal to the vector select
5844 operation "MASK ? VEC : IDENTITY". Insert the select statements
5845 before GSI. */
5847 static tree
5848 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5849 tree vec, tree identity)
5851 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5852 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5853 mask, vec, identity);
5854 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5855 return cond;
5858 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5859 order, starting with LHS. Insert the extraction statements before GSI and
5860 associate the new scalar SSA names with variable SCALAR_DEST.
5861 Return the SSA name for the result. */
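/* For example, with CODE == PLUS_EXPR and a four-element VECTOR_RHS the
   expansion below is equivalent to

       t0 = LHS + VECTOR_RHS[0];
       t1 = t0 + VECTOR_RHS[1];
       t2 = t1 + VECTOR_RHS[2];
       t3 = t2 + VECTOR_RHS[3];

   and T3 is returned, preserving the original left-to-right association.  */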
5863 static tree
5864 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5865 tree_code code, tree lhs, tree vector_rhs)
5867 tree vectype = TREE_TYPE (vector_rhs);
5868 tree scalar_type = TREE_TYPE (vectype);
5869 tree bitsize = TYPE_SIZE (scalar_type);
5870 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5871 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5873 for (unsigned HOST_WIDE_INT bit_offset = 0;
5874 bit_offset < vec_size_in_bits;
5875 bit_offset += element_bitsize)
5877 tree bitpos = bitsize_int (bit_offset);
5878 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5879 bitsize, bitpos);
5881 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5882 rhs = make_ssa_name (scalar_dest, stmt);
5883 gimple_assign_set_lhs (stmt, rhs);
5884 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5886 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5887 tree new_name = make_ssa_name (scalar_dest, stmt);
5888 gimple_assign_set_lhs (stmt, new_name);
5889 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5890 lhs = new_name;
5892 return lhs;
5895 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5896 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5897 statement. CODE is the operation performed by STMT and OPS are
5898 its scalar operands. REDUC_INDEX is the index of the operand in
5899 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5900 implements in-order reduction, or IFN_LAST if we should open-code it.
5901 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5902 that should be used to control the operation in a fully-masked loop. */
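/* A typical source of such reductions is an in-order floating-point
   accumulation such as

       for (int i = 0; i < n; ++i)
	 res += a[i];

   compiled without -ffast-math, where reassociation is not allowed.  Each
   vector of A is then folded into the scalar accumulator element by element,
   either through REDUC_FN or, when that is IFN_LAST, through the open-coded
   expansion in vect_expand_fold_left above.  */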
5904 static bool
5905 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5906 gimple **vec_stmt, slp_tree slp_node,
5907 gimple *reduc_def_stmt,
5908 tree_code code, internal_fn reduc_fn,
5909 tree ops[3], tree vectype_in,
5910 int reduc_index, vec_loop_masks *masks)
5912 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5913 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5914 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5915 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5916 gimple *new_stmt = NULL;
5918 int ncopies;
5919 if (slp_node)
5920 ncopies = 1;
5921 else
5922 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5924 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5925 gcc_assert (ncopies == 1);
5926 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5927 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5928 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5929 == FOLD_LEFT_REDUCTION);
5931 if (slp_node)
5932 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5933 TYPE_VECTOR_SUBPARTS (vectype_in)));
5935 tree op0 = ops[1 - reduc_index];
5937 int group_size = 1;
5938 gimple *scalar_dest_def;
5939 auto_vec<tree> vec_oprnds0;
5940 if (slp_node)
5942 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5943 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5944 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5946 else
5948 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5949 vec_oprnds0.create (1);
5950 vec_oprnds0.quick_push (loop_vec_def0);
5951 scalar_dest_def = stmt;
5954 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5955 tree scalar_type = TREE_TYPE (scalar_dest);
5956 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5958 int vec_num = vec_oprnds0.length ();
5959 gcc_assert (vec_num == 1 || slp_node);
5960 tree vec_elem_type = TREE_TYPE (vectype_out);
5961 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5963 tree vector_identity = NULL_TREE;
5964 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5965 vector_identity = build_zero_cst (vectype_out);
5967 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5968 int i;
5969 tree def0;
5970 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5972 tree mask = NULL_TREE;
5973 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5974 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5976 /* Handle MINUS by adding the negative. */
5977 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5979 tree negated = make_ssa_name (vectype_out);
5980 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5981 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5982 def0 = negated;
5985 if (mask)
5986 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5987 vector_identity);
5989 /* On the first iteration the input is simply the scalar phi
5990 result, and for subsequent iterations it is the output of
5991 the preceding operation. */
5992 if (reduc_fn != IFN_LAST)
5994 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5995 /* For chained SLP reductions the output of the previous reduction
5996 operation serves as the input of the next. For the final statement
5997 the output cannot be a temporary - we reuse the original
5998 scalar destination of the last statement. */
5999 if (i != vec_num - 1)
6001 gimple_set_lhs (new_stmt, scalar_dest_var);
6002 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6003 gimple_set_lhs (new_stmt, reduc_var);
6006 else
6008 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6009 reduc_var, def0);
6010 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6011 /* Remove the statement, so that we can use the same code paths
6012 as for statements that we've just created. */
6013 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6014 gsi_remove (&tmp_gsi, false);
6017 if (i == vec_num - 1)
6019 gimple_set_lhs (new_stmt, scalar_dest);
6020 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6022 else
6023 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6025 if (slp_node)
6026 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6029 if (!slp_node)
6030 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6032 return true;
6035 /* Function is_nonwrapping_integer_induction.
6037 Check if the induction defined by STMT (which is part of loop LOOP)
6038 is increasing and cannot overflow. */
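/* For example, an induction with base 0 and step 4 in a loop that executes
   at most 100 times reaches at most 0 + 4 * 100 = 400, which needs far
   fewer bits than a 32-bit induction variable provides, so it cannot wrap.
   (If overflow is undefined for the type, we simply assume it does not
   happen.)  */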
6040 static bool
6041 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6043 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6044 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6045 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6046 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6047 widest_int ni, max_loop_value, lhs_max;
6048 bool overflow = false;
6050 /* Make sure the loop is integer based. */
6051 if (TREE_CODE (base) != INTEGER_CST
6052 || TREE_CODE (step) != INTEGER_CST)
6053 return false;
6055 /* Check that the max size of the loop will not wrap. */
6057 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6058 return true;
6060 if (! max_stmt_executions (loop, &ni))
6061 return false;
6063 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6064 &overflow);
6065 if (overflow)
6066 return false;
6068 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6069 TYPE_SIGN (lhs_type), &overflow);
6070 if (overflow)
6071 return false;
6073 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6074 <= TYPE_PRECISION (lhs_type));
6077 /* Function vectorizable_reduction.
6079 Check if STMT performs a reduction operation that can be vectorized.
6080 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6081 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6082 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6084 This function also handles reduction idioms (patterns) that have been
6085 recognized in advance during vect_pattern_recog. In this case, STMT may be
6086 of this form:
6087 X = pattern_expr (arg0, arg1, ..., X)
6088 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6089 sequence that had been detected and replaced by the pattern-stmt (STMT).
6091 This function also handles reduction of condition expressions, for example:
6092 for (int i = 0; i < N; i++)
6093 if (a[i] < value)
6094 last = a[i];
6095 This is handled by vectorising the loop and creating an additional vector
6096 containing the loop indexes for which "a[i] < value" was true. In the
6097 function epilogue this is reduced to a single max value and then used to
6098 index into the vector of results.
6100 In some cases of reduction patterns, the type of the reduction variable X is
6101 different than the type of the other arguments of STMT.
6102 In such cases, the vectype that is used when transforming STMT into a vector
6103 stmt is different than the vectype that is used to determine the
6104 vectorization factor, because it consists of a different number of elements
6105 than the actual number of elements that are being operated upon in parallel.
6107 For example, consider an accumulation of shorts into an int accumulator.
6108 On some targets it's possible to vectorize this pattern operating on 8
6109 shorts at a time (hence, the vectype for purposes of determining the
6110 vectorization factor should be V8HI); on the other hand, the vectype that
6111 is used to create the vector form is actually V4SI (the type of the result).
6113 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6114 indicates what is the actual level of parallelism (V8HI in the example), so
6115 that the right vectorization factor would be derived. This vectype
6116 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6117 be used to create the vectorized stmt. The right vectype for the vectorized
6118 stmt is obtained from the type of the result X:
6119 get_vectype_for_scalar_type (TREE_TYPE (X))
6121 This means that, contrary to "regular" reductions (or "regular" stmts in
6122 general), the following equation:
6123 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6124 does *NOT* necessarily hold for reduction patterns. */
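/* As a minimal sketch of the widening accumulation described above (modes
   shown for the target assumed in that example):

       short s[N];
       int sum = 0;
       for (int i = 0; i < N; i++)
	 sum += s[i];

   STMT_VINFO_VECTYPE is V8HI, reflecting the eight shorts processed per
   vector iteration, while the vectorized statement itself is created with
   the V4SI type returned by get_vectype_for_scalar_type (TREE_TYPE (X)).  */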
6126 bool
6127 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6128 gimple **vec_stmt, slp_tree slp_node,
6129 slp_instance slp_node_instance,
6130 stmt_vector_for_cost *cost_vec)
6132 tree vec_dest;
6133 tree scalar_dest;
6134 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6135 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6136 tree vectype_in = NULL_TREE;
6137 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6138 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6139 enum tree_code code, orig_code;
6140 internal_fn reduc_fn;
6141 machine_mode vec_mode;
6142 int op_type;
6143 optab optab;
6144 tree new_temp = NULL_TREE;
6145 gimple *def_stmt;
6146 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6147 gimple *cond_reduc_def_stmt = NULL;
6148 enum tree_code cond_reduc_op_code = ERROR_MARK;
6149 tree scalar_type;
6150 bool is_simple_use;
6151 gimple *orig_stmt;
6152 stmt_vec_info orig_stmt_info = NULL;
6153 int i;
6154 int ncopies;
6155 int epilog_copies;
6156 stmt_vec_info prev_stmt_info, prev_phi_info;
6157 bool single_defuse_cycle = false;
6158 gimple *new_stmt = NULL;
6159 int j;
6160 tree ops[3];
6161 enum vect_def_type dts[3];
6162 bool nested_cycle = false, found_nested_cycle_def = false;
6163 bool double_reduc = false;
6164 basic_block def_bb;
6165 struct loop * def_stmt_loop, *outer_loop = NULL;
6166 tree def_arg;
6167 gimple *def_arg_stmt;
6168 auto_vec<tree> vec_oprnds0;
6169 auto_vec<tree> vec_oprnds1;
6170 auto_vec<tree> vec_oprnds2;
6171 auto_vec<tree> vect_defs;
6172 auto_vec<gimple *> phis;
6173 int vec_num;
6174 tree def0, tem;
6175 bool first_p = true;
6176 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6177 tree cond_reduc_val = NULL_TREE;
6179 /* Make sure it was already recognized as a reduction computation. */
6180 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6181 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6182 return false;
6184 if (nested_in_vect_loop_p (loop, stmt))
6186 outer_loop = loop;
6187 loop = loop->inner;
6188 nested_cycle = true;
6191 /* In case of reduction chain we switch to the first stmt in the chain, but
6192 we don't update STMT_INFO, since only the last stmt is marked as reduction
6193 and has reduction properties. */
6194 if (GROUP_FIRST_ELEMENT (stmt_info)
6195 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6197 stmt = GROUP_FIRST_ELEMENT (stmt_info);
6198 first_p = false;
6201 if (gimple_code (stmt) == GIMPLE_PHI)
6203 /* Analysis is fully done on the reduction stmt invocation. */
6204 if (! vec_stmt)
6206 if (slp_node)
6207 slp_node_instance->reduc_phis = slp_node;
6209 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6210 return true;
6213 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6214 /* Leave the scalar phi in place. Note that checking
6215 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6216 for reductions involving a single statement. */
6217 return true;
6219 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6220 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6221 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6223 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6224 == EXTRACT_LAST_REDUCTION)
6225 /* Leave the scalar phi in place. */
6226 return true;
6228 gcc_assert (is_gimple_assign (reduc_stmt));
6229 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6231 tree op = gimple_op (reduc_stmt, k);
6232 if (op == gimple_phi_result (stmt))
6233 continue;
6234 if (k == 1
6235 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6236 continue;
6237 if (!vectype_in
6238 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6239 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6240 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6241 break;
6243 gcc_assert (vectype_in);
6245 if (slp_node)
6246 ncopies = 1;
6247 else
6248 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6250 use_operand_p use_p;
6251 gimple *use_stmt;
6252 if (ncopies > 1
6253 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6254 <= vect_used_only_live)
6255 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6256 && (use_stmt == reduc_stmt
6257 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6258 == reduc_stmt)))
6259 single_defuse_cycle = true;
6261 /* Create the destination vector */
6262 scalar_dest = gimple_assign_lhs (reduc_stmt);
6263 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6265 if (slp_node)
6266 /* The size vect_schedule_slp_instance computes is off for us. */
6267 vec_num = vect_get_num_vectors
6268 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6269 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6270 vectype_in);
6271 else
6272 vec_num = 1;
6274 /* Generate the reduction PHIs upfront. */
6275 prev_phi_info = NULL;
6276 for (j = 0; j < ncopies; j++)
6278 if (j == 0 || !single_defuse_cycle)
6280 for (i = 0; i < vec_num; i++)
6282 /* Create the reduction-phi that defines the reduction
6283 operand. */
6284 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6285 set_vinfo_for_stmt (new_phi,
6286 new_stmt_vec_info (new_phi, loop_vinfo));
6288 if (slp_node)
6289 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6290 else
6292 if (j == 0)
6293 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6294 else
6295 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6296 prev_phi_info = vinfo_for_stmt (new_phi);
6302 return true;
6305 /* 1. Is vectorizable reduction? */
6306 /* Not supportable if the reduction variable is used in the loop, unless
6307 it's a reduction chain. */
6308 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6309 && !GROUP_FIRST_ELEMENT (stmt_info))
6310 return false;
6312 /* Reductions that are not used even in an enclosing outer-loop,
6313 are expected to be "live" (used out of the loop). */
6314 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6315 && !STMT_VINFO_LIVE_P (stmt_info))
6316 return false;
6318 /* 2. Has this been recognized as a reduction pattern?
6320 Check if STMT represents a pattern that has been recognized
6321 in earlier analysis stages. For stmts that represent a pattern,
6322 the STMT_VINFO_RELATED_STMT field records the last stmt in
6323 the original sequence that constitutes the pattern. */
6325 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6326 if (orig_stmt)
6328 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6329 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6330 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6333 /* 3. Check the operands of the operation. The first operands are defined
6334 inside the loop body. The last operand is the reduction variable,
6335 which is defined by the loop-header-phi. */
6337 gcc_assert (is_gimple_assign (stmt));
6339 /* Flatten RHS. */
6340 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6342 case GIMPLE_BINARY_RHS:
6343 code = gimple_assign_rhs_code (stmt);
6344 op_type = TREE_CODE_LENGTH (code);
6345 gcc_assert (op_type == binary_op);
6346 ops[0] = gimple_assign_rhs1 (stmt);
6347 ops[1] = gimple_assign_rhs2 (stmt);
6348 break;
6350 case GIMPLE_TERNARY_RHS:
6351 code = gimple_assign_rhs_code (stmt);
6352 op_type = TREE_CODE_LENGTH (code);
6353 gcc_assert (op_type == ternary_op);
6354 ops[0] = gimple_assign_rhs1 (stmt);
6355 ops[1] = gimple_assign_rhs2 (stmt);
6356 ops[2] = gimple_assign_rhs3 (stmt);
6357 break;
6359 case GIMPLE_UNARY_RHS:
6360 return false;
6362 default:
6363 gcc_unreachable ();
6366 if (code == COND_EXPR && slp_node)
6367 return false;
6369 scalar_dest = gimple_assign_lhs (stmt);
6370 scalar_type = TREE_TYPE (scalar_dest);
6371 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6372 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6373 return false;
6375 /* Do not try to vectorize bit-precision reductions. */
6376 if (!type_has_mode_precision_p (scalar_type))
6377 return false;
6379 /* All uses but the last are expected to be defined in the loop.
6380 The last use is the reduction variable. In case of nested cycle this
6381 assumption is not true: we use reduc_index to record the index of the
6382 reduction variable. */
6383 gimple *reduc_def_stmt = NULL;
6384 int reduc_index = -1;
6385 for (i = 0; i < op_type; i++)
6387 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6388 if (i == 0 && code == COND_EXPR)
6389 continue;
6391 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6392 &def_stmt, &dts[i], &tem);
6393 dt = dts[i];
6394 gcc_assert (is_simple_use);
6395 if (dt == vect_reduction_def)
6397 reduc_def_stmt = def_stmt;
6398 reduc_index = i;
6399 continue;
6401 else if (tem)
6403 /* To properly compute ncopies we are interested in the widest
6404 input type in case we're looking at a widening accumulation. */
6405 if (!vectype_in
6406 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6407 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6408 vectype_in = tem;
6411 if (dt != vect_internal_def
6412 && dt != vect_external_def
6413 && dt != vect_constant_def
6414 && dt != vect_induction_def
6415 && !(dt == vect_nested_cycle && nested_cycle))
6416 return false;
6418 if (dt == vect_nested_cycle)
6420 found_nested_cycle_def = true;
6421 reduc_def_stmt = def_stmt;
6422 reduc_index = i;
6425 if (i == 1 && code == COND_EXPR)
6427 /* Record how value of COND_EXPR is defined. */
6428 if (dt == vect_constant_def)
6430 cond_reduc_dt = dt;
6431 cond_reduc_val = ops[i];
6433 if (dt == vect_induction_def
6434 && def_stmt != NULL
6435 && is_nonwrapping_integer_induction (def_stmt, loop))
6437 cond_reduc_dt = dt;
6438 cond_reduc_def_stmt = def_stmt;
6443 if (!vectype_in)
6444 vectype_in = vectype_out;
6446 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6447 directly used in stmt. */
6448 if (reduc_index == -1)
6450 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6452 if (dump_enabled_p ())
6453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6454 "in-order reduction chain without SLP.\n");
6455 return false;
6458 if (orig_stmt)
6459 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6460 else
6461 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6464 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6465 return false;
6467 if (!(reduc_index == -1
6468 || dts[reduc_index] == vect_reduction_def
6469 || dts[reduc_index] == vect_nested_cycle
6470 || ((dts[reduc_index] == vect_internal_def
6471 || dts[reduc_index] == vect_external_def
6472 || dts[reduc_index] == vect_constant_def
6473 || dts[reduc_index] == vect_induction_def)
6474 && nested_cycle && found_nested_cycle_def)))
6476 /* For pattern recognized stmts, orig_stmt might be a reduction,
6477 but some helper statements for the pattern might not, or
6478 might be COND_EXPRs with reduction uses in the condition. */
6479 gcc_assert (orig_stmt);
6480 return false;
6483 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6484 enum vect_reduction_type v_reduc_type
6485 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6486 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6488 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6489 /* If we have a condition reduction, see if we can simplify it further. */
6490 if (v_reduc_type == COND_REDUCTION)
6492 /* TODO: We can't yet handle reduction chains, since we need to treat
6493 each COND_EXPR in the chain specially, not just the last one.
6494 E.g. for:
6496 x_1 = PHI <x_3, ...>
6497 x_2 = a_2 ? ... : x_1;
6498 x_3 = a_3 ? ... : x_2;
6500 we're interested in the last element in x_3 for which a_2 || a_3
6501 is true, whereas the current reduction chain handling would
6502 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6503 as a reduction operation. */
6504 if (reduc_index == -1)
6506 if (dump_enabled_p ())
6507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6508 "conditional reduction chains not supported\n");
6509 return false;
6512 /* vect_is_simple_reduction ensured that operand 2 is the
6513 loop-carried operand. */
6514 gcc_assert (reduc_index == 2);
6516 /* Loop peeling modifies the initial value of the reduction PHI, which
6517 makes the reduction stmt to be transformed differ from the
6518 original stmt analyzed. We need to record the reduction code for
6519 CONST_COND_REDUCTION type reductions at the analysis stage, so that
6520 it can be used directly at the transform stage. */
6521 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6522 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6524 /* Also set the reduction type to CONST_COND_REDUCTION. */
6525 gcc_assert (cond_reduc_dt == vect_constant_def);
6526 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6528 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6529 vectype_in, OPTIMIZE_FOR_SPEED))
6531 if (dump_enabled_p ())
6532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6533 "optimizing condition reduction with"
6534 " FOLD_EXTRACT_LAST.\n");
6535 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6537 else if (cond_reduc_dt == vect_induction_def)
6539 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6540 tree base
6541 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6542 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6544 gcc_assert (TREE_CODE (base) == INTEGER_CST
6545 && TREE_CODE (step) == INTEGER_CST);
6546 cond_reduc_val = NULL_TREE;
6547 /* Find a suitable value: below base for MAX_EXPR, above base for
6548 MIN_EXPR; for now punt if base is the minimum value of the type
6549 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6550 if (tree_int_cst_sgn (step) == -1)
6552 cond_reduc_op_code = MIN_EXPR;
6553 if (tree_int_cst_sgn (base) == -1)
6554 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6555 else if (tree_int_cst_lt (base,
6556 TYPE_MAX_VALUE (TREE_TYPE (base))))
6557 cond_reduc_val
6558 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6560 else
6562 cond_reduc_op_code = MAX_EXPR;
6563 if (tree_int_cst_sgn (base) == 1)
6564 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6565 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6566 base))
6567 cond_reduc_val
6568 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6570 if (cond_reduc_val)
6572 if (dump_enabled_p ())
6573 dump_printf_loc (MSG_NOTE, vect_location,
6574 "condition expression based on "
6575 "integer induction.\n");
6576 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6577 = INTEGER_INDUC_COND_REDUCTION;
6580 else if (cond_reduc_dt == vect_constant_def)
6582 enum vect_def_type cond_initial_dt;
6583 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6584 tree cond_initial_val
6585 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6587 gcc_assert (cond_reduc_val != NULL_TREE);
6588 vect_is_simple_use (cond_initial_val, loop_vinfo,
6589 &def_stmt, &cond_initial_dt);
6590 if (cond_initial_dt == vect_constant_def
6591 && types_compatible_p (TREE_TYPE (cond_initial_val),
6592 TREE_TYPE (cond_reduc_val)))
6594 tree e = fold_binary (LE_EXPR, boolean_type_node,
6595 cond_initial_val, cond_reduc_val);
6596 if (e && (integer_onep (e) || integer_zerop (e)))
6598 if (dump_enabled_p ())
6599 dump_printf_loc (MSG_NOTE, vect_location,
6600 "condition expression based on "
6601 "compile time constant.\n");
6602 /* Record reduction code at analysis stage. */
6603 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6604 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6605 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6606 = CONST_COND_REDUCTION;
6612 if (orig_stmt)
6613 gcc_assert (tmp == orig_stmt
6614 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6615 else
6616 /* We changed STMT to be the first stmt in reduction chain, hence we
6617 check that in this case the first element in the chain is STMT. */
6618 gcc_assert (stmt == tmp
6619 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6621 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6622 return false;
6624 if (slp_node)
6625 ncopies = 1;
6626 else
6627 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6629 gcc_assert (ncopies >= 1);
6631 vec_mode = TYPE_MODE (vectype_in);
6632 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6634 if (code == COND_EXPR)
6636 /* Only call during the analysis stage, otherwise we'll lose
6637 STMT_VINFO_TYPE. */
6638 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6639 ops[reduc_index], 0, NULL,
6640 cost_vec))
6642 if (dump_enabled_p ())
6643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6644 "unsupported condition in reduction\n");
6645 return false;
6648 else
6650 /* 4. Supportable by target? */
6652 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6653 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6655 /* Shifts and rotates are only supported by vectorizable_shifts,
6656 not vectorizable_reduction. */
6657 if (dump_enabled_p ())
6658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6659 "unsupported shift or rotation.\n");
6660 return false;
6663 /* 4.1. check support for the operation in the loop */
6664 optab = optab_for_tree_code (code, vectype_in, optab_default);
6665 if (!optab)
6667 if (dump_enabled_p ())
6668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6669 "no optab.\n");
6671 return false;
6674 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6676 if (dump_enabled_p ())
6677 dump_printf (MSG_NOTE, "op not supported by target.\n");
6679 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6680 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6681 return false;
6683 if (dump_enabled_p ())
6684 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6687 /* Worthwhile without SIMD support? */
6688 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6689 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6691 if (dump_enabled_p ())
6692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6693 "not worthwhile without SIMD support.\n");
6695 return false;
6699 /* 4.2. Check support for the epilog operation.
6701 If STMT represents a reduction pattern, then the type of the
6702 reduction variable may be different than the type of the rest
6703 of the arguments. For example, consider the case of accumulation
6704 of shorts into an int accumulator. The original code:
6705 S1: int_a = (int) short_a;
6706 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6708 was replaced with:
6709 STMT: int_acc = widen_sum <short_a, int_acc>
6711 This means that:
6712 1. The tree-code that is used to create the vector operation in the
6713 epilog code (that reduces the partial results) is not the
6714 tree-code of STMT, but is rather the tree-code of the original
6715 stmt from the pattern that STMT is replacing. I.e, in the example
6716 above we want to use 'widen_sum' in the loop, but 'plus' in the
6717 epilog.
6718 2. The type (mode) we use to check available target support
6719 for the vector operation to be created in the *epilog*, is
6720 determined by the type of the reduction variable (in the example
6721 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6722 However the type (mode) we use to check available target support
6723 for the vector operation to be created *inside the loop*, is
6724 determined by the type of the other arguments to STMT (in the
6725 example we'd check this: optab_handler (widen_sum_optab,
6726 vect_short_mode)).
6728 This is contrary to "regular" reductions, in which the types of all
6729 the arguments are the same as the type of the reduction variable.
6730 For "regular" reductions we can therefore use the same vector type
6731 (and also the same tree-code) when generating the epilog code and
6732 when generating the code inside the loop. */
6734 vect_reduction_type reduction_type
6735 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6736 if (orig_stmt
6737 && (reduction_type == TREE_CODE_REDUCTION
6738 || reduction_type == FOLD_LEFT_REDUCTION))
6740 /* This is a reduction pattern: get the vectype from the type of the
6741 reduction variable, and get the tree-code from orig_stmt. */
6742 orig_code = gimple_assign_rhs_code (orig_stmt);
6743 gcc_assert (vectype_out);
6744 vec_mode = TYPE_MODE (vectype_out);
6746 else
6748 /* Regular reduction: the same vectype and tree-code as used for the
6749 vector code inside the loop can be used for the epilog code. */
6750 orig_code = code;
6752 if (code == MINUS_EXPR)
6753 orig_code = PLUS_EXPR;
6755 /* For simple condition reductions, replace with the actual expression
6756 we want to base our reduction around. */
6757 if (reduction_type == CONST_COND_REDUCTION)
6759 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6760 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6762 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6763 orig_code = cond_reduc_op_code;
6766 if (nested_cycle)
6768 def_bb = gimple_bb (reduc_def_stmt);
6769 def_stmt_loop = def_bb->loop_father;
6770 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6771 loop_preheader_edge (def_stmt_loop));
6772 if (TREE_CODE (def_arg) == SSA_NAME
6773 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6774 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6775 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6776 && vinfo_for_stmt (def_arg_stmt)
6777 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6778 == vect_double_reduction_def)
6779 double_reduc = true;
6782 reduc_fn = IFN_LAST;
6784 if (reduction_type == TREE_CODE_REDUCTION
6785 || reduction_type == FOLD_LEFT_REDUCTION
6786 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6787 || reduction_type == CONST_COND_REDUCTION)
6789 if (reduction_type == FOLD_LEFT_REDUCTION
6790 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6791 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6793 if (reduc_fn != IFN_LAST
6794 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6795 OPTIMIZE_FOR_SPEED))
6797 if (dump_enabled_p ())
6798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6799 "reduc op not supported by target.\n");
6801 reduc_fn = IFN_LAST;
6804 else
6806 if (!nested_cycle || double_reduc)
6808 if (dump_enabled_p ())
6809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6810 "no reduc code for scalar code.\n");
6812 return false;
6816 else if (reduction_type == COND_REDUCTION)
6818 int scalar_precision
6819 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6820 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6821 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6822 nunits_out);
6824 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6825 OPTIMIZE_FOR_SPEED))
6826 reduc_fn = IFN_REDUC_MAX;
6829 if (reduction_type != EXTRACT_LAST_REDUCTION
6830 && reduc_fn == IFN_LAST
6831 && !nunits_out.is_constant ())
6833 if (dump_enabled_p ())
6834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6835 "missing target support for reduction on"
6836 " variable-length vectors.\n");
6837 return false;
6840 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6841 && ncopies > 1)
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6845 "multiple types in double reduction or condition "
6846 "reduction.\n");
6847 return false;
6850 /* For SLP reductions, see if there is a neutral value we can use. */
6851 tree neutral_op = NULL_TREE;
6852 if (slp_node)
6853 neutral_op
6854 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
6855 GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6857 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6859 /* We can't support in-order reductions of code such as this:
6861 for (int i = 0; i < n1; ++i)
6862 for (int j = 0; j < n2; ++j)
6863 l += a[j];
6865 since GCC effectively transforms the loop when vectorizing:
6867 for (int i = 0; i < n1 / VF; ++i)
6868 for (int j = 0; j < n2; ++j)
6869 for (int k = 0; k < VF; ++k)
6870 l += a[j];
6872 which is a reassociation of the original operation. */
6873 if (dump_enabled_p ())
6874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6875 "in-order double reduction not supported.\n");
6877 return false;
6880 if (reduction_type == FOLD_LEFT_REDUCTION
6881 && slp_node
6882 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6884 /* We cannot use in-order reductions in this case because there is
6885 an implicit reassociation of the operations involved. */
6886 if (dump_enabled_p ())
6887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6888 "in-order unchained SLP reductions not supported.\n");
6889 return false;
6892 /* For double reductions, and for SLP reductions with a neutral value,
6893 we construct a variable-length initial vector by loading a vector
6894 full of the neutral value and then shift-and-inserting the start
6895 values into the low-numbered elements. */
6896 if ((double_reduc || neutral_op)
6897 && !nunits_out.is_constant ()
6898 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6899 vectype_out, OPTIMIZE_FOR_SPEED))
6901 if (dump_enabled_p ())
6902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6903 "reduction on variable-length vectors requires"
6904 " target support for a vector-shift-and-insert"
6905 " operation.\n");
6906 return false;
6909 /* Check extra constraints for variable-length unchained SLP reductions. */
6910 if (STMT_SLP_TYPE (stmt_info)
6911 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6912 && !nunits_out.is_constant ())
6914 /* We checked above that we could build the initial vector when
6915 there's a neutral element value. Check here for the case in
6916 which each SLP statement has its own initial value and in which
6917 that value needs to be repeated for every instance of the
6918 statement within the initial vector. */
6919 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6920 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6921 if (!neutral_op
6922 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6924 if (dump_enabled_p ())
6925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6926 "unsupported form of SLP reduction for"
6927 " variable-length vectors: cannot build"
6928 " initial vector.\n");
6929 return false;
6931 /* The epilogue code relies on the number of elements being a multiple
6932 of the group size. The duplicate-and-interleave approach to setting
6933 up the initial vector does too. */
6934 if (!multiple_p (nunits_out, group_size))
6936 if (dump_enabled_p ())
6937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6938 "unsupported form of SLP reduction for"
6939 " variable-length vectors: the vector size"
6940 " is not a multiple of the number of results.\n");
6941 return false;
6945 /* In case of widening multiplication by a constant, we update the type
6946 of the constant to be the type of the other operand. We check that the
6947 constant fits the type in the pattern recognition pass. */
6948 if (code == DOT_PROD_EXPR
6949 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6951 if (TREE_CODE (ops[0]) == INTEGER_CST)
6952 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6953 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6954 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6955 else
6957 if (dump_enabled_p ())
6958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6959 "invalid types in dot-prod\n");
6961 return false;
6965 if (reduction_type == COND_REDUCTION)
6967 widest_int ni;
6969 if (! max_loop_iterations (loop, &ni))
6971 if (dump_enabled_p ())
6972 dump_printf_loc (MSG_NOTE, vect_location,
6973 "loop count not known, cannot create cond "
6974 "reduction.\n");
6975 return false;
6977 /* Convert backedges to iterations. */
6978 ni += 1;
6980 /* The additional index will be the same type as the condition. Check
6981 that the loop count fits into this type less one (because we use up
6982 the zero slot for the case when there are no matches). */
6983 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6984 if (wi::geu_p (ni, wi::to_widest (max_index)))
6986 if (dump_enabled_p ())
6987 dump_printf_loc (MSG_NOTE, vect_location,
6988 "loop size is greater than data size.\n");
6989 return false;
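/* For illustration, assuming cr_index_scalar_type is an 8-bit unsigned
   type: max_index is 255, so after the ni += 1 conversion above the loop
   may run at most 254 iterations; index 0 stays reserved for the
   no-match case, and larger iteration counts are rejected here.  */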
6993 /* In case the vectorization factor (VF) is bigger than the number
6994 of elements that we can fit in a vectype (nunits), we have to generate
6995 more than one vector stmt - i.e. we need to "unroll" the
6996 vector stmt by a factor VF/nunits. For more details see documentation
6997 in vectorizable_operation. */
6999 /* If the reduction is used in an outer loop we need to generate
7000 VF intermediate results, like so (e.g. for ncopies=2):
7001 r0 = phi (init, r0)
7002 r1 = phi (init, r1)
7003 r0 = x0 + r0;
7004 r1 = x1 + r1;
7005 (i.e. we generate VF results in 2 registers).
7006 In this case we have a separate def-use cycle for each copy, and therefore
7007 for each copy we get the vector def for the reduction variable from the
7008 respective phi node created for this copy.
7010 Otherwise (the reduction is unused in the loop nest), we can combine
7011 together intermediate results, like so (e.g. for ncopies=2):
7012 r = phi (init, r)
7013 r = x0 + r;
7014 r = x1 + r;
7015 (i.e. we generate VF/2 results in a single register).
7016 In this case for each copy we get the vector def for the reduction variable
7017 from the vectorized reduction operation generated in the previous iteration.
7019 This only works when we see both the reduction PHI and its only consumer
7020 in vectorizable_reduction and there are no intermediate stmts
7021 participating. */
7022 use_operand_p use_p;
7023 gimple *use_stmt;
7024 if (ncopies > 1
7025 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7026 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7027 && (use_stmt == stmt
7028 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7030 single_defuse_cycle = true;
7031 epilog_copies = 1;
7033 else
7034 epilog_copies = ncopies;
7036 /* If the reduction stmt is one of the patterns that have lane
7037 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7038 if ((ncopies > 1
7039 && ! single_defuse_cycle)
7040 && (code == DOT_PROD_EXPR
7041 || code == WIDEN_SUM_EXPR
7042 || code == SAD_EXPR))
7044 if (dump_enabled_p ())
7045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7046 "multi def-use cycle not possible for lane-reducing "
7047 "reduction operation\n");
7048 return false;
7051 if (slp_node)
7052 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7053 else
7054 vec_num = 1;
7056 internal_fn cond_fn = get_conditional_internal_fn (code);
7057 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7059 if (!vec_stmt) /* transformation not required. */
7061 if (first_p)
7062 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7063 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7065 if (reduction_type != FOLD_LEFT_REDUCTION
7066 && (cond_fn == IFN_LAST
7067 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7068 OPTIMIZE_FOR_SPEED)))
7070 if (dump_enabled_p ())
7071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7072 "can't use a fully-masked loop because no"
7073 " conditional operation is available.\n");
7074 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7076 else if (reduc_index == -1)
7078 if (dump_enabled_p ())
7079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7080 "can't use a fully-masked loop for chained"
7081 " reductions.\n");
7082 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7084 else
7085 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7086 vectype_in);
7088 if (dump_enabled_p ()
7089 && reduction_type == FOLD_LEFT_REDUCTION)
7090 dump_printf_loc (MSG_NOTE, vect_location,
7091 "using an in-order (fold-left) reduction.\n");
7092 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7093 return true;
7096 /* Transform. */
7098 if (dump_enabled_p ())
7099 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7101 /* FORNOW: Multiple types are not supported for condition. */
7102 if (code == COND_EXPR)
7103 gcc_assert (ncopies == 1);
7105 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7107 if (reduction_type == FOLD_LEFT_REDUCTION)
7108 return vectorize_fold_left_reduction
7109 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7110 reduc_fn, ops, vectype_in, reduc_index, masks);
7112 if (reduction_type == EXTRACT_LAST_REDUCTION)
7114 gcc_assert (!slp_node);
7115 return vectorizable_condition (stmt, gsi, vec_stmt,
7116 NULL, reduc_index, NULL, NULL);
7119 /* Create the destination vector.  */
7120 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7122 prev_stmt_info = NULL;
7123 prev_phi_info = NULL;
7124 if (!slp_node)
7126 vec_oprnds0.create (1);
7127 vec_oprnds1.create (1);
7128 if (op_type == ternary_op)
7129 vec_oprnds2.create (1);
7132 phis.create (vec_num);
7133 vect_defs.create (vec_num);
7134 if (!slp_node)
7135 vect_defs.quick_push (NULL_TREE);
7137 if (slp_node)
7138 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7139 else
7140 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7142 for (j = 0; j < ncopies; j++)
7144 if (code == COND_EXPR)
7146 gcc_assert (!slp_node);
7147 vectorizable_condition (stmt, gsi, vec_stmt,
7148 PHI_RESULT (phis[0]),
7149 reduc_index, NULL, NULL);
7150 /* Multiple types are not supported for condition. */
7151 break;
7154 /* Handle uses. */
7155 if (j == 0)
7157 if (slp_node)
7159 /* Get vec defs for all the operands except the reduction index,
7160 ensuring the ordering of the ops in the vector is kept. */
7161 auto_vec<tree, 3> slp_ops;
7162 auto_vec<vec<tree>, 3> vec_defs;
7164 slp_ops.quick_push (ops[0]);
7165 slp_ops.quick_push (ops[1]);
7166 if (op_type == ternary_op)
7167 slp_ops.quick_push (ops[2]);
7169 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7171 vec_oprnds0.safe_splice (vec_defs[0]);
7172 vec_defs[0].release ();
7173 vec_oprnds1.safe_splice (vec_defs[1]);
7174 vec_defs[1].release ();
7175 if (op_type == ternary_op)
7177 vec_oprnds2.safe_splice (vec_defs[2]);
7178 vec_defs[2].release ();
7181 else
7183 vec_oprnds0.quick_push
7184 (vect_get_vec_def_for_operand (ops[0], stmt));
7185 vec_oprnds1.quick_push
7186 (vect_get_vec_def_for_operand (ops[1], stmt));
7187 if (op_type == ternary_op)
7188 vec_oprnds2.quick_push
7189 (vect_get_vec_def_for_operand (ops[2], stmt));
7192 else
7194 if (!slp_node)
7196 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7198 if (single_defuse_cycle && reduc_index == 0)
7199 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7200 else
7201 vec_oprnds0[0]
7202 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7203 if (single_defuse_cycle && reduc_index == 1)
7204 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7205 else
7206 vec_oprnds1[0]
7207 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7208 if (op_type == ternary_op)
7210 if (single_defuse_cycle && reduc_index == 2)
7211 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7212 else
7213 vec_oprnds2[0]
7214 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7219 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7221 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7222 if (masked_loop_p)
7224 /* Make sure that the reduction accumulator is vop[0]. */
7225 if (reduc_index == 1)
7227 gcc_assert (commutative_tree_code (code));
7228 std::swap (vop[0], vop[1]);
7230 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7231 vectype_in, i * ncopies + j);
7232 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7233 vop[0], vop[1]);
7234 new_temp = make_ssa_name (vec_dest, call);
7235 gimple_call_set_lhs (call, new_temp);
7236 gimple_call_set_nothrow (call, true);
7237 new_stmt = call;
7239 else
7241 if (op_type == ternary_op)
7242 vop[2] = vec_oprnds2[i];
7244 new_temp = make_ssa_name (vec_dest, new_stmt);
7245 new_stmt = gimple_build_assign (new_temp, code,
7246 vop[0], vop[1], vop[2]);
7248 vect_finish_stmt_generation (stmt, new_stmt, gsi);
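/* A rough sketch of the masked statement built above for a PLUS_EXPR
   reduction (SSA names hypothetical):
       vect_sum_new = .COND_ADD (loop_mask, vect_sum_prev, vect_x);
   Inactive lanes carry the accumulator value through unchanged, which is
   why the accumulator had to be moved into vop[0] before building the
   call.  The unmasked path instead emits a plain vector assignment.  */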
7250 if (slp_node)
7252 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7253 vect_defs.quick_push (new_temp);
7255 else
7256 vect_defs[0] = new_temp;
7259 if (slp_node)
7260 continue;
7262 if (j == 0)
7263 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7264 else
7265 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7267 prev_stmt_info = vinfo_for_stmt (new_stmt);
7270 /* Finalize the reduction-phi (set its arguments) and create the
7271 epilog reduction code. */
7272 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7273 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7275 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7276 epilog_copies, reduc_fn, phis,
7277 double_reduc, slp_node, slp_node_instance,
7278 cond_reduc_val, cond_reduc_op_code,
7279 neutral_op);
7281 return true;
7284 /* Function vect_min_worthwhile_factor.
7286 For a loop where we could vectorize the operation indicated by CODE,
7287 return the minimum vectorization factor that makes it worthwhile
7288 to use generic vectors. */
7289 static unsigned int
7290 vect_min_worthwhile_factor (enum tree_code code)
7292 switch (code)
7294 case PLUS_EXPR:
7295 case MINUS_EXPR:
7296 case NEGATE_EXPR:
7297 return 4;
7299 case BIT_AND_EXPR:
7300 case BIT_IOR_EXPR:
7301 case BIT_XOR_EXPR:
7302 case BIT_NOT_EXPR:
7303 return 2;
7305 default:
7306 return INT_MAX;
7310 /* Return true if VINFO indicates we are doing loop vectorization and if
7311 it is worth decomposing CODE operations into scalar operations for
7312 that loop's vectorization factor. */
7314 bool
7315 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7317 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7318 unsigned HOST_WIDE_INT value;
7319 return (loop_vinfo
7320 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7321 && value >= vect_min_worthwhile_factor (code));
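/* For illustration: with a constant vectorization factor of 4, decomposing
   a PLUS_EXPR into four scalar additions is considered worthwhile
   (4 >= vect_min_worthwhile_factor (PLUS_EXPR) == 4), whereas a factor of
   2 is not; bitwise codes such as BIT_AND_EXPR already pay off at 2.  */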
7324 /* Function vectorizable_induction
7326 Check if PHI performs an induction computation that can be vectorized.
7327 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7328 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7329 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7331 bool
7332 vectorizable_induction (gimple *phi,
7333 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7334 gimple **vec_stmt, slp_tree slp_node,
7335 stmt_vector_for_cost *cost_vec)
7337 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7338 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7339 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7340 unsigned ncopies;
7341 bool nested_in_vect_loop = false;
7342 struct loop *iv_loop;
7343 tree vec_def;
7344 edge pe = loop_preheader_edge (loop);
7345 basic_block new_bb;
7346 tree new_vec, vec_init, vec_step, t;
7347 tree new_name;
7348 gimple *new_stmt;
7349 gphi *induction_phi;
7350 tree induc_def, vec_dest;
7351 tree init_expr, step_expr;
7352 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7353 unsigned i;
7354 tree expr;
7355 gimple_seq stmts;
7356 imm_use_iterator imm_iter;
7357 use_operand_p use_p;
7358 gimple *exit_phi;
7359 edge latch_e;
7360 tree loop_arg;
7361 gimple_stmt_iterator si;
7362 basic_block bb = gimple_bb (phi);
7364 if (gimple_code (phi) != GIMPLE_PHI)
7365 return false;
7367 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7368 return false;
7370 /* Make sure it was recognized as induction computation. */
7371 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7372 return false;
7374 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7375 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7377 if (slp_node)
7378 ncopies = 1;
7379 else
7380 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7381 gcc_assert (ncopies >= 1);
7383 /* FORNOW. These restrictions should be relaxed. */
7384 if (nested_in_vect_loop_p (loop, phi))
7386 imm_use_iterator imm_iter;
7387 use_operand_p use_p;
7388 gimple *exit_phi;
7389 edge latch_e;
7390 tree loop_arg;
7392 if (ncopies > 1)
7394 if (dump_enabled_p ())
7395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7396 "multiple types in nested loop.\n");
7397 return false;
7400 /* FORNOW: outer loop induction with SLP not supported. */
7401 if (STMT_SLP_TYPE (stmt_info))
7402 return false;
7404 exit_phi = NULL;
7405 latch_e = loop_latch_edge (loop->inner);
7406 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7407 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7409 gimple *use_stmt = USE_STMT (use_p);
7410 if (is_gimple_debug (use_stmt))
7411 continue;
7413 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7415 exit_phi = use_stmt;
7416 break;
7419 if (exit_phi)
7421 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7422 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7423 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7425 if (dump_enabled_p ())
7426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7427 "inner-loop induction only used outside "
7428 "of the outer vectorized loop.\n");
7429 return false;
7433 nested_in_vect_loop = true;
7434 iv_loop = loop->inner;
7436 else
7437 iv_loop = loop;
7438 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7440 if (slp_node && !nunits.is_constant ())
7442 /* The current SLP code creates the initial value element-by-element. */
7443 if (dump_enabled_p ())
7444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7445 "SLP induction not supported for variable-length"
7446 " vectors.\n");
7447 return false;
7450 if (!vec_stmt) /* transformation not required. */
7452 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7453 if (dump_enabled_p ())
7454 dump_printf_loc (MSG_NOTE, vect_location,
7455 "=== vectorizable_induction ===\n");
7456 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7457 return true;
7460 /* Transform. */
7462 /* Compute a vector variable, initialized with the first VF values of
7463 the induction variable. E.g., for an iv with IV_PHI='X' and
7464 evolution S, for a vector of 4 units, we want to compute:
7465 [X, X + S, X + 2*S, X + 3*S]. */
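/* A small worked instance of the comment above (numbers invented): for
   X = 3, S = 2, four elements per vector and a vectorization factor of 4,
   the prolog computes
       vec_init = { 3, 5, 7, 9 }
       vec_step = { 8, 8, 8, 8 }   (VF * S in each lane)
   and every vector iteration adds vec_step to the induction vector.  */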
7467 if (dump_enabled_p ())
7468 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7470 latch_e = loop_latch_edge (iv_loop);
7471 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7473 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7474 gcc_assert (step_expr != NULL_TREE);
7476 pe = loop_preheader_edge (iv_loop);
7477 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7478 loop_preheader_edge (iv_loop));
7480 stmts = NULL;
7481 if (!nested_in_vect_loop)
7483 /* Convert the initial value to the desired type. */
7484 tree new_type = TREE_TYPE (vectype);
7485 init_expr = gimple_convert (&stmts, new_type, init_expr);
7487 /* If we are using the loop mask to "peel" for alignment then we need
7488 to adjust the start value here. */
7489 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7490 if (skip_niters != NULL_TREE)
7492 if (FLOAT_TYPE_P (vectype))
7493 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7494 skip_niters);
7495 else
7496 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7497 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7498 skip_niters, step_expr);
7499 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7500 init_expr, skip_step);
7504 /* Convert the step to the desired type. */
7505 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7507 if (stmts)
7509 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7510 gcc_assert (!new_bb);
7513 /* Find the first insertion point in the BB. */
7514 si = gsi_after_labels (bb);
7516 /* For SLP induction we have to generate several IVs as for example
7517 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7518 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7519 [VF*S, VF*S, VF*S, VF*S] for all. */
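/* Worked numbers for the comment above (hypothetical sizes): with group
   size 3 and const_nunits 4, nivs = lcm (3, 4) / 4 = 3 initial IV vectors
   are built element by element; with group size 2 and const_nunits 4 only
   nivs = 1 vector is needed, and later copies are derived by adding a
   step of VF' * S with VF' = lcm (2, 4) / 2 = 2, as in the re-use code
   below.  */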
7520 if (slp_node)
7522 /* Enforced above. */
7523 unsigned int const_nunits = nunits.to_constant ();
7525 /* Generate [VF*S, VF*S, ... ]. */
7526 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7528 expr = build_int_cst (integer_type_node, vf);
7529 expr = fold_convert (TREE_TYPE (step_expr), expr);
7531 else
7532 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7533 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7534 expr, step_expr);
7535 if (! CONSTANT_CLASS_P (new_name))
7536 new_name = vect_init_vector (phi, new_name,
7537 TREE_TYPE (step_expr), NULL);
7538 new_vec = build_vector_from_val (vectype, new_name);
7539 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7541 /* Now generate the IVs. */
7542 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7543 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7544 unsigned elts = const_nunits * nvects;
7545 unsigned nivs = least_common_multiple (group_size,
7546 const_nunits) / const_nunits;
7547 gcc_assert (elts % group_size == 0);
7548 tree elt = init_expr;
7549 unsigned ivn;
7550 for (ivn = 0; ivn < nivs; ++ivn)
7552 tree_vector_builder elts (vectype, const_nunits, 1);
7553 stmts = NULL;
7554 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7556 if (ivn*const_nunits + eltn >= group_size
7557 && (ivn * const_nunits + eltn) % group_size == 0)
7558 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7559 elt, step_expr);
7560 elts.quick_push (elt);
7562 vec_init = gimple_build_vector (&stmts, &elts);
7563 if (stmts)
7565 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7566 gcc_assert (!new_bb);
7569 /* Create the induction-phi that defines the induction-operand. */
7570 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7571 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7572 set_vinfo_for_stmt (induction_phi,
7573 new_stmt_vec_info (induction_phi, loop_vinfo));
7574 induc_def = PHI_RESULT (induction_phi);
7576 /* Create the iv update inside the loop.  */
7577 vec_def = make_ssa_name (vec_dest);
7578 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7579 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7580 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7582 /* Set the arguments of the phi node: */
7583 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7584 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7585 UNKNOWN_LOCATION);
7587 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7590 /* Re-use IVs when we can. */
7591 if (ivn < nvects)
7593 unsigned vfp
7594 = least_common_multiple (group_size, const_nunits) / group_size;
7595 /* Generate [VF'*S, VF'*S, ... ]. */
7596 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7598 expr = build_int_cst (integer_type_node, vfp);
7599 expr = fold_convert (TREE_TYPE (step_expr), expr);
7601 else
7602 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7603 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7604 expr, step_expr);
7605 if (! CONSTANT_CLASS_P (new_name))
7606 new_name = vect_init_vector (phi, new_name,
7607 TREE_TYPE (step_expr), NULL);
7608 new_vec = build_vector_from_val (vectype, new_name);
7609 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7610 for (; ivn < nvects; ++ivn)
7612 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7613 tree def;
7614 if (gimple_code (iv) == GIMPLE_PHI)
7615 def = gimple_phi_result (iv);
7616 else
7617 def = gimple_assign_lhs (iv);
7618 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7619 PLUS_EXPR,
7620 def, vec_step);
7621 if (gimple_code (iv) == GIMPLE_PHI)
7622 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7623 else
7625 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7626 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7628 set_vinfo_for_stmt (new_stmt,
7629 new_stmt_vec_info (new_stmt, loop_vinfo));
7630 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7634 return true;
7637 /* Create the vector that holds the initial_value of the induction. */
7638 if (nested_in_vect_loop)
7640 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7641 been created during vectorization of previous stmts. We obtain it
7642 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7643 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7644 /* If the initial value is not of proper type, convert it. */
7645 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7647 new_stmt
7648 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7649 vect_simple_var,
7650 "vec_iv_"),
7651 VIEW_CONVERT_EXPR,
7652 build1 (VIEW_CONVERT_EXPR, vectype,
7653 vec_init));
7654 vec_init = gimple_assign_lhs (new_stmt);
7655 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7656 new_stmt);
7657 gcc_assert (!new_bb);
7658 set_vinfo_for_stmt (new_stmt,
7659 new_stmt_vec_info (new_stmt, loop_vinfo));
7662 else
7664 /* iv_loop is the loop to be vectorized. Create:
7665 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7666 stmts = NULL;
7667 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7669 unsigned HOST_WIDE_INT const_nunits;
7670 if (nunits.is_constant (&const_nunits))
7672 tree_vector_builder elts (vectype, const_nunits, 1);
7673 elts.quick_push (new_name);
7674 for (i = 1; i < const_nunits; i++)
7676 /* Create: new_name_i = new_name + step_expr */
7677 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7678 new_name, step_expr);
7679 elts.quick_push (new_name);
7681 /* Create a vector from [new_name_0, new_name_1, ...,
7682 new_name_nunits-1] */
7683 vec_init = gimple_build_vector (&stmts, &elts);
7685 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7686 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7687 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7688 new_name, step_expr);
7689 else
7691 /* Build:
7692 [base, base, base, ...]
7693 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7694 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7695 gcc_assert (flag_associative_math);
7696 tree index = build_index_vector (vectype, 0, 1);
7697 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7698 new_name);
7699 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7700 step_expr);
7701 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7702 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7703 vec_init, step_vec);
7704 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7705 vec_init, base_vec);
7708 if (stmts)
7710 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7711 gcc_assert (!new_bb);
7716 /* Create the vector that holds the step of the induction. */
7717 if (nested_in_vect_loop)
7718 /* iv_loop is nested in the loop to be vectorized. Generate:
7719 vec_step = [S, S, S, S] */
7720 new_name = step_expr;
7721 else
7723 /* iv_loop is the loop to be vectorized. Generate:
7724 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7725 gimple_seq seq = NULL;
7726 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7728 expr = build_int_cst (integer_type_node, vf);
7729 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7731 else
7732 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7733 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7734 expr, step_expr);
7735 if (seq)
7737 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7738 gcc_assert (!new_bb);
7742 t = unshare_expr (new_name);
7743 gcc_assert (CONSTANT_CLASS_P (new_name)
7744 || TREE_CODE (new_name) == SSA_NAME);
7745 new_vec = build_vector_from_val (vectype, t);
7746 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7749 /* Create the following def-use cycle:
7750 loop prolog:
7751 vec_init = ...
7752 vec_step = ...
7753 loop:
7754 vec_iv = PHI <vec_init, vec_loop>
7756 STMT
7758 vec_loop = vec_iv + vec_step; */
7760 /* Create the induction-phi that defines the induction-operand. */
7761 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7762 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7763 set_vinfo_for_stmt (induction_phi,
7764 new_stmt_vec_info (induction_phi, loop_vinfo));
7765 induc_def = PHI_RESULT (induction_phi);
7767 /* Create the iv update inside the loop.  */
7768 vec_def = make_ssa_name (vec_dest);
7769 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7770 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7771 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7773 /* Set the arguments of the phi node: */
7774 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7775 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7776 UNKNOWN_LOCATION);
7778 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7780 /* In case the vectorization factor (VF) is bigger than the number
7781 of elements that we can fit in a vectype (nunits), we have to generate
7782 more than one vector stmt - i.e. we need to "unroll" the
7783 vector stmt by a factor VF/nunits. For more details see documentation
7784 in vectorizable_operation. */
7786 if (ncopies > 1)
7788 gimple_seq seq = NULL;
7789 stmt_vec_info prev_stmt_vinfo;
7790 /* FORNOW. This restriction should be relaxed. */
7791 gcc_assert (!nested_in_vect_loop);
7793 /* Create the vector that holds the step of the induction. */
7794 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7796 expr = build_int_cst (integer_type_node, nunits);
7797 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7799 else
7800 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7801 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7802 expr, step_expr);
7803 if (seq)
7805 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7806 gcc_assert (!new_bb);
7809 t = unshare_expr (new_name);
7810 gcc_assert (CONSTANT_CLASS_P (new_name)
7811 || TREE_CODE (new_name) == SSA_NAME);
7812 new_vec = build_vector_from_val (vectype, t);
7813 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7815 vec_def = induc_def;
7816 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7817 for (i = 1; i < ncopies; i++)
7819 /* vec_i = vec_prev + vec_step */
7820 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7821 vec_def, vec_step);
7822 vec_def = make_ssa_name (vec_dest, new_stmt);
7823 gimple_assign_set_lhs (new_stmt, vec_def);
7825 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7826 set_vinfo_for_stmt (new_stmt,
7827 new_stmt_vec_info (new_stmt, loop_vinfo));
7828 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7829 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7833 if (nested_in_vect_loop)
7835 /* Find the loop-closed exit-phi of the induction, and record
7836 the final vector of induction results: */
7837 exit_phi = NULL;
7838 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7840 gimple *use_stmt = USE_STMT (use_p);
7841 if (is_gimple_debug (use_stmt))
7842 continue;
7844 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7846 exit_phi = use_stmt;
7847 break;
7850 if (exit_phi)
7852 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7853 /* FORNOW.  We do not yet support the case in which an inner-loop induction
7854 is used only outside the outer loop (i.e. not in the outer loop at all). */
7855 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7856 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7858 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7859 if (dump_enabled_p ())
7861 dump_printf_loc (MSG_NOTE, vect_location,
7862 "vector of inductions after inner-loop:");
7863 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7869 if (dump_enabled_p ())
7871 dump_printf_loc (MSG_NOTE, vect_location,
7872 "transform induction: created def-use cycle: ");
7873 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7874 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7875 SSA_NAME_DEF_STMT (vec_def), 0);
7878 return true;
7881 /* Function vectorizable_live_operation.
7883 STMT computes a value that is used outside the loop. Check if
7884 it can be supported. */
7886 bool
7887 vectorizable_live_operation (gimple *stmt,
7888 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7889 slp_tree slp_node, int slp_index,
7890 gimple **vec_stmt,
7891 stmt_vector_for_cost *)
7893 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7894 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7895 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7896 imm_use_iterator imm_iter;
7897 tree lhs, lhs_type, bitsize, vec_bitsize;
7898 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7899 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7900 int ncopies;
7901 gimple *use_stmt;
7902 auto_vec<tree> vec_oprnds;
7903 int vec_entry = 0;
7904 poly_uint64 vec_index = 0;
7906 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7908 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7909 return false;
7911 /* FORNOW. CHECKME. */
7912 if (nested_in_vect_loop_p (loop, stmt))
7913 return false;
7915 /* If STMT is not relevant and it is a simple assignment and its inputs are
7916 invariant then it can remain in place, unvectorized. The original last
7917 scalar value that it computes will be used. */
7918 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7920 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7921 if (dump_enabled_p ())
7922 dump_printf_loc (MSG_NOTE, vect_location,
7923 "statement is simple and uses invariant. Leaving in "
7924 "place.\n");
7925 return true;
7928 if (slp_node)
7929 ncopies = 1;
7930 else
7931 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7933 if (slp_node)
7935 gcc_assert (slp_index >= 0);
7937 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7938 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7940 /* Get the last occurrence of the scalar index from the concatenation of
7941 all the slp vectors. Calculate which slp vector it is and the index
7942 within. */
7943 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7945 /* Calculate which vector contains the result, and which lane of
7946 that vector we need. */
7947 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7949 if (dump_enabled_p ())
7950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7951 "Cannot determine which vector holds the"
7952 " final result.\n");
7953 return false;
7957 if (!vec_stmt)
7959 /* No transformation required. */
7960 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7962 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7963 OPTIMIZE_FOR_SPEED))
7965 if (dump_enabled_p ())
7966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7967 "can't use a fully-masked loop because "
7968 "the target doesn't support extract last "
7969 "reduction.\n");
7970 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7972 else if (slp_node)
7974 if (dump_enabled_p ())
7975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7976 "can't use a fully-masked loop because an "
7977 "SLP statement is live after the loop.\n");
7978 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7980 else if (ncopies > 1)
7982 if (dump_enabled_p ())
7983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7984 "can't use a fully-masked loop because"
7985 " ncopies is greater than 1.\n");
7986 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7988 else
7990 gcc_assert (ncopies == 1 && !slp_node);
7991 vect_record_loop_mask (loop_vinfo,
7992 &LOOP_VINFO_MASKS (loop_vinfo),
7993 1, vectype);
7996 return true;
7999 /* If stmt has a related stmt, then use that for getting the lhs. */
8000 if (is_pattern_stmt_p (stmt_info))
8001 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8003 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8004 : gimple_get_lhs (stmt);
8005 lhs_type = TREE_TYPE (lhs);
8007 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8008 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8009 : TYPE_SIZE (TREE_TYPE (vectype)));
8010 vec_bitsize = TYPE_SIZE (vectype);
8012 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8013 tree vec_lhs, bitstart;
8014 if (slp_node)
8016 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8018 /* Get the correct slp vectorized stmt. */
8019 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8020 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8021 vec_lhs = gimple_phi_result (phi);
8022 else
8023 vec_lhs = gimple_get_lhs (vec_stmt);
8025 /* Get entry to use. */
8026 bitstart = bitsize_int (vec_index);
8027 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8029 else
8031 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8032 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8033 gcc_checking_assert (ncopies == 1
8034 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8036 /* For multiple copies, get the last copy. */
8037 for (int i = 1; i < ncopies; ++i)
8038 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8039 vec_lhs);
8041 /* Get the last lane in the vector. */
8042 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
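/* For illustration, assuming a V4SI vector type: vec_bitsize is 128 and
   bitsize is 32, so bitstart becomes 96 and the BIT_FIELD_REF built below
   extracts lane 3, the live value produced by the last vector copy.  */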
8045 gimple_seq stmts = NULL;
8046 tree new_tree;
8047 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8049 /* Emit:
8051 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8053 where VEC_LHS is the vectorized live-out result and MASK is
8054 the loop mask for the final iteration. */
8055 gcc_assert (ncopies == 1 && !slp_node);
8056 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8057 tree scalar_res = make_ssa_name (scalar_type);
8058 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8059 1, vectype, 0);
8060 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8061 2, mask, vec_lhs);
8062 gimple_call_set_lhs (new_stmt, scalar_res);
8063 gimple_seq_add_stmt (&stmts, new_stmt);
8065 /* Convert the extracted vector element to the required scalar type. */
8066 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8068 else
8070 tree bftype = TREE_TYPE (vectype);
8071 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8072 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8073 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8074 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8075 &stmts, true, NULL_TREE);
8078 if (stmts)
8079 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8081 /* Replace uses of lhs with the newly computed result.  If the use stmt is
8082 a single-arg PHI, just replace all uses of the PHI result.  This is
8083 necessary because the lcssa PHI defining lhs may precede the new stmt. */
8084 use_operand_p use_p;
8085 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8086 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8087 && !is_gimple_debug (use_stmt))
8089 if (gimple_code (use_stmt) == GIMPLE_PHI
8090 && gimple_phi_num_args (use_stmt) == 1)
8092 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8094 else
8096 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8097 SET_USE (use_p, new_tree);
8099 update_stmt (use_stmt);
8102 return true;
8105 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8107 static void
8108 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8110 ssa_op_iter op_iter;
8111 imm_use_iterator imm_iter;
8112 def_operand_p def_p;
8113 gimple *ustmt;
8115 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8117 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8119 basic_block bb;
8121 if (!is_gimple_debug (ustmt))
8122 continue;
8124 bb = gimple_bb (ustmt);
8126 if (!flow_bb_inside_loop_p (loop, bb))
8128 if (gimple_debug_bind_p (ustmt))
8130 if (dump_enabled_p ())
8131 dump_printf_loc (MSG_NOTE, vect_location,
8132 "killing debug use\n");
8134 gimple_debug_bind_reset_value (ustmt);
8135 update_stmt (ustmt);
8137 else
8138 gcc_unreachable ();
8144 /* Given loop represented by LOOP_VINFO, return true if computation of
8145 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8146 otherwise. */
8148 static bool
8149 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8151 /* Constant case. */
8152 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8154 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8155 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8157 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8158 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8159 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8160 return true;
8163 widest_int max;
8164 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8165 /* Check the upper bound of loop niters. */
8166 if (get_max_loop_iterations (loop, &max))
8168 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8169 signop sgn = TYPE_SIGN (type);
8170 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8171 if (max < type_max)
8172 return true;
8174 return false;
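/* For illustration of the check above: if the niters type is a 32-bit
   unsigned type, the maximum latch count must be strictly below
   0xffffffff, since NITERS = NITERSM1 + 1 would otherwise wrap to zero.  */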
8177 /* Return a mask type with half the number of elements as TYPE. */
8179 tree
8180 vect_halve_mask_nunits (tree type)
8182 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8183 return build_truth_vector_type (nunits, current_vector_size);
8186 /* Return a mask type with twice as many elements as TYPE. */
8188 tree
8189 vect_double_mask_nunits (tree type)
8191 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8192 return build_truth_vector_type (nunits, current_vector_size);
8195 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8196 contain a sequence of NVECTORS masks that each control a vector of type
8197 VECTYPE. */
8199 void
8200 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8201 unsigned int nvectors, tree vectype)
8203 gcc_assert (nvectors != 0);
8204 if (masks->length () < nvectors)
8205 masks->safe_grow_cleared (nvectors);
8206 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8207 /* The number of scalars per iteration and the number of vectors are
8208 both compile-time constants. */
8209 unsigned int nscalars_per_iter
8210 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8211 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
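/* For illustration (hypothetical types): with a vectorization factor of 8,
   recording nvectors = 2 masks for V8HI gives
       nscalars_per_iter = 2 * 8 / 8 = 2,
   so the rgroup at index nvectors - 1 tracks two scalars per iteration
   and uses a V8HI-sized boolean mask type.  */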
8212 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8214 rgm->max_nscalars_per_iter = nscalars_per_iter;
8215 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8219 /* Given a complete set of masks MASKS, extract mask number INDEX
8220 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8221 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8223 See the comment above vec_loop_masks for more details about the mask
8224 arrangement. */
8226 tree
8227 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8228 unsigned int nvectors, tree vectype, unsigned int index)
8230 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8231 tree mask_type = rgm->mask_type;
8233 /* Populate the rgroup's mask array, if this is the first time we've
8234 used it. */
8235 if (rgm->masks.is_empty ())
8237 rgm->masks.safe_grow_cleared (nvectors);
8238 for (unsigned int i = 0; i < nvectors; ++i)
8240 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8241 /* Provide a dummy definition until the real one is available. */
8242 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8243 rgm->masks[i] = mask;
8247 tree mask = rgm->masks[index];
8248 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8249 TYPE_VECTOR_SUBPARTS (vectype)))
8251 /* A loop mask for data type X can be reused for data type Y
8252 if X has N times more elements than Y and if Y's elements
8253 are N times bigger than X's. In this case each sequence
8254 of N elements in the loop mask will be all-zero or all-one.
8255 We can then view-convert the mask so that each sequence of
8256 N elements is replaced by a single element. */
8257 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8258 TYPE_VECTOR_SUBPARTS (vectype)));
8259 gimple_seq seq = NULL;
8260 mask_type = build_same_sized_truth_vector_type (vectype);
8261 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8262 if (seq)
8263 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8265 return mask;
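/* For illustration of the reuse above: a mask recorded for sixteen QImode
   elements can be reused for a vector of eight HImode elements; each pair
   of QI mask elements is known to be all-zero or all-one, so the
   VIEW_CONVERT_EXPR folds every pair into one HI-sized mask element.  */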
8268 /* Scale profiling counters by estimation for LOOP which is vectorized
8269 by factor VF. */
8271 static void
8272 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8274 edge preheader = loop_preheader_edge (loop);
8275 /* Reduce loop iterations by the vectorization factor. */
8276 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8277 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8279 if (freq_h.nonzero_p ())
8281 profile_probability p;
8283 /* Avoid dropping loop body profile counter to 0 because of zero count
8284 in loop's preheader. */
8285 if (!(freq_e == profile_count::zero ()))
8286 freq_e = freq_e.force_nonzero ();
8287 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8288 scale_loop_frequencies (loop, p);
8291 edge exit_e = single_exit (loop);
8292 exit_e->probability = profile_probability::always ()
8293 .apply_scale (1, new_est_niter + 1);
8295 edge exit_l = single_pred_edge (loop->latch);
8296 profile_probability prob = exit_l->probability;
8297 exit_l->probability = exit_e->probability.invert ();
8298 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8299 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
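/* A numeric sketch of the scaling above (values invented): if
   niter_for_unrolled_loop returns 24, the exit edge gets probability
   1 / (24 + 1) = 4% and the body frequencies are scaled so that the
   header count is roughly 25 times the preheader count.  */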
8302 /* Function vect_transform_loop.
8304 The analysis phase has determined that the loop is vectorizable.
8305 Vectorize the loop: create vectorized stmts to replace the scalar
8306 stmts in the loop, and update the loop exit condition.
8307 Returns scalar epilogue loop if any. */
8309 struct loop *
8310 vect_transform_loop (loop_vec_info loop_vinfo)
8312 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8313 struct loop *epilogue = NULL;
8314 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8315 int nbbs = loop->num_nodes;
8316 int i;
8317 tree niters_vector = NULL_TREE;
8318 tree step_vector = NULL_TREE;
8319 tree niters_vector_mult_vf = NULL_TREE;
8320 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8321 unsigned int lowest_vf = constant_lower_bound (vf);
8322 bool grouped_store;
8323 bool slp_scheduled = false;
8324 gimple *stmt, *pattern_stmt;
8325 gimple_seq pattern_def_seq = NULL;
8326 gimple_stmt_iterator pattern_def_si = gsi_none ();
8327 bool transform_pattern_stmt = false;
8328 bool check_profitability = false;
8329 unsigned int th;
8331 if (dump_enabled_p ())
8332 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8334 /* Use the more conservative vectorization threshold.  If the number
8335 of iterations is constant, assume the cost check has been performed
8336 by our caller.  If the threshold makes all loops profitable that
8337 run at least the (estimated) vectorization factor number of times,
8338 checking is pointless, too. */
8339 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8340 if (th >= vect_vf_for_cost (loop_vinfo)
8341 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8343 if (dump_enabled_p ())
8344 dump_printf_loc (MSG_NOTE, vect_location,
8345 "Profitability threshold is %d loop iterations.\n",
8346 th);
8347 check_profitability = true;
8350 /* Make sure there exists a single-predecessor exit bb. Do this before
8351 versioning. */
8352 edge e = single_exit (loop);
8353 if (! single_pred_p (e->dest))
8355 split_loop_exit_edge (e);
8356 if (dump_enabled_p ())
8357 dump_printf (MSG_NOTE, "split exit edge\n");
8360 /* Version the loop first, if required, so the profitability check
8361 comes first. */
8363 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8365 poly_uint64 versioning_threshold
8366 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8367 if (check_profitability
8368 && ordered_p (poly_uint64 (th), versioning_threshold))
8370 versioning_threshold = ordered_max (poly_uint64 (th),
8371 versioning_threshold);
8372 check_profitability = false;
8374 vect_loop_versioning (loop_vinfo, th, check_profitability,
8375 versioning_threshold);
8376 check_profitability = false;
8379 /* Make sure there exists a single-predecessor exit bb also on the
8380 scalar loop copy.  Do this after versioning but before peeling
8381 so the CFG structure is fine for both the scalar and the if-converted
8382 loop, and so slpeel_duplicate_current_defs_from_edges sees matched
8383 loop-closed PHI nodes on the exit. */
8384 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8386 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8387 if (! single_pred_p (e->dest))
8389 split_loop_exit_edge (e);
8390 if (dump_enabled_p ())
8391 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8395 tree niters = vect_build_loop_niters (loop_vinfo);
8396 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8397 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8398 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8399 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8400 &step_vector, &niters_vector_mult_vf, th,
8401 check_profitability, niters_no_overflow);
8403 if (niters_vector == NULL_TREE)
8405 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8406 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8407 && known_eq (lowest_vf, vf))
8409 niters_vector
8410 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8411 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8412 step_vector = build_one_cst (TREE_TYPE (niters));
8414 else
8415 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8416 &step_vector, niters_no_overflow);
8419 /* 1) Make sure the loop header has exactly two entries
8420 2) Make sure we have a preheader basic block. */
8422 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8424 split_edge (loop_preheader_edge (loop));
8426 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8427 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8428 /* This will deal with any possible peeling. */
8429 vect_prepare_for_masked_peels (loop_vinfo);
8431 /* FORNOW: the vectorizer supports only loops whose body consists
8432 of one basic block (header + empty latch).  When the vectorizer
8433 supports more involved loop forms, the order in which the BBs are
8434 traversed will need to be reconsidered. */
8436 for (i = 0; i < nbbs; i++)
8438 basic_block bb = bbs[i];
8439 stmt_vec_info stmt_info;
8441 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8442 gsi_next (&si))
8444 gphi *phi = si.phi ();
8445 if (dump_enabled_p ())
8447 dump_printf_loc (MSG_NOTE, vect_location,
8448 "------>vectorizing phi: ");
8449 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8451 stmt_info = vinfo_for_stmt (phi);
8452 if (!stmt_info)
8453 continue;
8455 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8456 vect_loop_kill_debug_uses (loop, phi);
8458 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8459 && !STMT_VINFO_LIVE_P (stmt_info))
8460 continue;
8462 if (STMT_VINFO_VECTYPE (stmt_info)
8463 && (maybe_ne
8464 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8465 && dump_enabled_p ())
8466 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8468 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8469 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8470 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8471 && ! PURE_SLP_STMT (stmt_info))
8473 if (dump_enabled_p ())
8474 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8475 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8479 pattern_stmt = NULL;
8480 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8481 !gsi_end_p (si) || transform_pattern_stmt;)
8483 bool is_store;
8485 if (transform_pattern_stmt)
8486 stmt = pattern_stmt;
8487 else
8489 stmt = gsi_stmt (si);
8490 /* During vectorization remove existing clobber stmts. */
8491 if (gimple_clobber_p (stmt))
8493 unlink_stmt_vdef (stmt);
8494 gsi_remove (&si, true);
8495 release_defs (stmt);
8496 continue;
8500 if (dump_enabled_p ())
8502 dump_printf_loc (MSG_NOTE, vect_location,
8503 "------>vectorizing statement: ");
8504 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8507 stmt_info = vinfo_for_stmt (stmt);
8509 /* vector stmts created in the outer-loop during vectorization of
8510 stmts in an inner-loop may not have a stmt_info, and do not
8511 need to be vectorized. */
8512 if (!stmt_info)
8514 gsi_next (&si);
8515 continue;
8518 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8519 vect_loop_kill_debug_uses (loop, stmt);
8521 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8522 && !STMT_VINFO_LIVE_P (stmt_info))
8524 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8525 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8526 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8527 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8529 stmt = pattern_stmt;
8530 stmt_info = vinfo_for_stmt (stmt);
8532 else
8534 gsi_next (&si);
8535 continue;
8538 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8539 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8540 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8541 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8542 transform_pattern_stmt = true;
8544 /* If pattern statement has def stmts, vectorize them too. */
8545 if (is_pattern_stmt_p (stmt_info))
8547 if (pattern_def_seq == NULL)
8549 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8550 pattern_def_si = gsi_start (pattern_def_seq);
8552 else if (!gsi_end_p (pattern_def_si))
8553 gsi_next (&pattern_def_si);
8554 if (pattern_def_seq != NULL)
8556 gimple *pattern_def_stmt = NULL;
8557 stmt_vec_info pattern_def_stmt_info = NULL;
8559 while (!gsi_end_p (pattern_def_si))
8561 pattern_def_stmt = gsi_stmt (pattern_def_si);
8562 pattern_def_stmt_info
8563 = vinfo_for_stmt (pattern_def_stmt);
8564 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8565 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8566 break;
8567 gsi_next (&pattern_def_si);
8570 if (!gsi_end_p (pattern_def_si))
8572 if (dump_enabled_p ())
8574 dump_printf_loc (MSG_NOTE, vect_location,
8575 "==> vectorizing pattern def "
8576 "stmt: ");
8577 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8578 pattern_def_stmt, 0);
8581 stmt = pattern_def_stmt;
8582 stmt_info = pattern_def_stmt_info;
8584 else
8586 pattern_def_si = gsi_none ();
8587 transform_pattern_stmt = false;
8590 else
8591 transform_pattern_stmt = false;
8594 if (STMT_VINFO_VECTYPE (stmt_info))
8596 poly_uint64 nunits
8597 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8598 if (!STMT_SLP_TYPE (stmt_info)
8599 && maybe_ne (nunits, vf)
8600 && dump_enabled_p ())
8601 /* For SLP, VF is set according to the unrolling factor, not the
8602 vector size, hence for SLP this message is not valid. */
8603 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8606 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8607 reached. */
8608 if (STMT_SLP_TYPE (stmt_info))
8610 if (!slp_scheduled)
8612 slp_scheduled = true;
8614 if (dump_enabled_p ())
8615 dump_printf_loc (MSG_NOTE, vect_location,
8616 "=== scheduling SLP instances ===\n");
8618 vect_schedule_slp (loop_vinfo);
8621 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8622 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8624 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8626 pattern_def_seq = NULL;
8627 gsi_next (&si);
8629 continue;
8633 /* -------- vectorize statement ------------ */
8634 if (dump_enabled_p ())
8635 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8637 grouped_store = false;
8638 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8639 if (is_store)
8641 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8643 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8644 interleaving chain was completed - free all the stores in
8645 the chain. */
8646 gsi_next (&si);
8647 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8649 else
8651 /* Free the attached stmt_vec_info and remove the stmt. */
8652 gimple *store = gsi_stmt (si);
8653 free_stmt_vec_info (store);
8654 unlink_stmt_vdef (store);
8655 gsi_remove (&si, true);
8656 release_defs (store);
8659 /* Stores can only appear at the end of pattern statements. */
8660 gcc_assert (!transform_pattern_stmt);
8661 pattern_def_seq = NULL;
8663 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8665 pattern_def_seq = NULL;
8666 gsi_next (&si);
8668 } /* stmts in BB */
8670 /* Stub out scalar statements that must not survive vectorization.
8671 Doing this here helps with grouped statements, or statements that
8672 are involved in patterns. */
8673 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8674 !gsi_end_p (gsi); gsi_next (&gsi))
8676 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8677 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8679 tree lhs = gimple_get_lhs (call);
8680 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8682 tree zero = build_zero_cst (TREE_TYPE (lhs));
8683 gimple *new_stmt = gimple_build_assign (lhs, zero);
8684 gsi_replace (&gsi, new_stmt, true);
8688 } /* BBs in loop */
8690 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8691 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8692 if (integer_onep (step_vector))
8693 niters_no_overflow = true;
8694 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8695 niters_vector_mult_vf, !niters_no_overflow);
8697 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8698 scale_profile_for_vect_loop (loop, assumed_vf);
8700 /* True if the final iteration might not handle a full vector's
8701 worth of scalar iterations. */
8702 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8703 /* The minimum number of iterations performed by the epilogue. This
8704 is 1 when peeling for gaps because we always need a final scalar
8705 iteration. */
8706 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8707 /* +1 to convert latch counts to loop iteration counts,
8708 -min_epilogue_iters to remove iterations that cannot be performed
8709 by the vector code. */
8710 int bias_for_lowest = 1 - min_epilogue_iters;
8711 int bias_for_assumed = bias_for_lowest;
8712 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8713 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8715 /* When the amount of peeling is known at compile time, the first
8716 iteration will have exactly alignment_npeels active elements.
8717 In the worst case it will have at least one. */
8718 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8719 bias_for_lowest += lowest_vf - min_first_active;
8720 bias_for_assumed += assumed_vf - min_first_active;
8722 /* In these calculations the "- 1" converts loop iteration counts
8723 back to latch counts. */
8724 if (loop->any_upper_bound)
8725 loop->nb_iterations_upper_bound
8726 = (final_iter_may_be_partial
8727 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8728 lowest_vf) - 1
8729 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8730 lowest_vf) - 1);
8731 if (loop->any_likely_upper_bound)
8732 loop->nb_iterations_likely_upper_bound
8733 = (final_iter_may_be_partial
8734 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8735 + bias_for_lowest, lowest_vf) - 1
8736 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8737 + bias_for_lowest, lowest_vf) - 1);
8738 if (loop->any_estimate)
8739 loop->nb_iterations_estimate
8740 = (final_iter_may_be_partial
8741 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8742 assumed_vf) - 1
8743 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8744 assumed_vf) - 1);
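/* Worked example for the bounds above (numbers invented): with an upper
   bound of 100 latch iterations, lowest_vf = 4, no peeling for gaps and
   no full masking, bias_for_lowest is 1 and the new bound becomes
   floor ((100 + 1) / 4) - 1 = 24 latch iterations of the vector loop.  */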
8746 if (dump_enabled_p ())
8748 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8750 dump_printf_loc (MSG_NOTE, vect_location,
8751 "LOOP VECTORIZED\n");
8752 if (loop->inner)
8753 dump_printf_loc (MSG_NOTE, vect_location,
8754 "OUTER LOOP VECTORIZED\n");
8755 dump_printf (MSG_NOTE, "\n");
8757 else
8759 dump_printf_loc (MSG_NOTE, vect_location,
8760 "LOOP EPILOGUE VECTORIZED (VS=");
8761 dump_dec (MSG_NOTE, current_vector_size);
8762 dump_printf (MSG_NOTE, ")\n");
8766 /* Free SLP instances here because otherwise stmt reference counting
8767 won't work. */
8768 slp_instance instance;
8769 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8770 vect_free_slp_instance (instance);
8771 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8772 /* Clear the safelen field since its value is invalid after vectorization,
8773 as the vectorized loop can have loop-carried dependencies. */
8774 loop->safelen = 0;
8776 /* Don't vectorize an epilogue of an epilogue loop. */
8777 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8778 epilogue = NULL;
  if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
    epilogue = NULL;

  if (epilogue)
    {
      auto_vector_sizes vector_sizes;
      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
      unsigned int next_size = 0;

      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
          && known_eq (vf, lowest_vf))
        {
          unsigned int eiters
            = (LOOP_VINFO_INT_NITERS (loop_vinfo)
               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
          eiters = eiters % lowest_vf;
          epilogue->nb_iterations_upper_bound = eiters - 1;
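
          /* A made-up example of the size selection below: with 100 known
             scalar iterations, no peeling for alignment and lowest_vf == 8,
             eiters is 100 % 8 == 4 and the bound above is 3.  If the target
             offered the (hypothetical) vector sizes { 32, 16 } and
             current_vector_size were 32, the 32-byte entry would be rejected
             (ratio 1, lowest_vf / 1 == 8 > 4) and the search would stop at
             the 16-byte entry (ratio 2, lowest_vf / 2 == 4 <= 4), so the
             epilogue would be considered with 16-byte vectors.  */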
          unsigned int ratio;
          while (next_size < vector_sizes.length ()
                 && !(constant_multiple_p (current_vector_size,
                                           vector_sizes[next_size], &ratio)
                      && eiters >= lowest_vf / ratio))
            next_size += 1;
        }
      else
        while (next_size < vector_sizes.length ()
               && maybe_lt (current_vector_size, vector_sizes[next_size]))
          next_size += 1;

      if (next_size == vector_sizes.length ())
        epilogue = NULL;
    }

  if (epilogue)
    {
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->safelen = loop->safelen;
      epilogue->dont_vectorize = false;

      /* We may need to if-convert the epilogue to vectorize it.  */
      if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
        tree_if_conversion (epilogue);
    }

  return epilogue;
}
/* The code below performs a simple optimization: it reverts if-conversion
   for masked stores, i.e. if the mask of a store is zero the store is not
   performed and, where possible, neither are the statements that produce
   the stored values.  For example,

     for (i=0; i<n; i++)
       if (c[i])
         {
           p1[i] += 1;
           p2[i] = p3[i] + 2;
         }

   this transformation will produce the following semi-hammock:

   if (!(mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 }))
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/
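
/* Sketch of the control flow produced for each masked store below, as an
   illustration rather than a quote from the sources: the block ending in
   "if (mask == { 0, ... })" takes its EDGE_TRUE_VALUE edge straight to the
   join block, skipping the stores, while the unlikely EDGE_FALSE_VALUE edge
   leads to the new STORE_BB, into which the MASK_STOREs and, where possible,
   their value producers are sunk before falling through to the join
   block.  */
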
void
optimize_mask_stores (struct loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  struct loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in the loop, if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          stmt = gsi_stmt (gsi);
          if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
            worklist.safe_push (stmt);
        }
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create STORE_BB and the if-then structure in the CFG.  STORE_BB
         belongs to the same loop as BB; that loop can be different from
         LOOP when a two-level loop nest is vectorized and the mask store
         belongs to the inner loop.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* STORE_BB is reached on the false edge, i.e. when the mask is not
         all zero; mark that edge as unlikely.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
        set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Create new block %d to sink mask stores.",
                         store_bb->index);
      /* Create a vector comparison with a boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
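      /* The generated condition is roughly (reusing the SSA names from the
         example in the comment above, for illustration only):
           if (mask__ifc__42.18_165 == { 0, ..., 0 })
             goto join_bb;
           else
             goto store_bb;  */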
      /* Create a new PHI node for the vdef of the last masked store:
         .MEM_2 = VDEF <.MEM_1>
         will be converted to
         .MEM_3 = VDEF <.MEM_1>
         and a new PHI node will be created in the join bb
         .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
        {
          gimple_stmt_iterator gsi_from;
          gimple *stmt1 = NULL;

          /* Move masked store to STORE_BB.  */
          last_store = last;
          gsi = gsi_for_stmt (last);
          gsi_from = gsi;
          /* Shift GSI to the previous stmt for further traversal.  */
          gsi_prev (&gsi);
          gsi_to = gsi_start_bb (store_bb);
          gsi_move_before (&gsi_from, &gsi_to);
          /* Set up GSI_TO at the start of the now non-empty block.  */
          gsi_to = gsi_start_bb (store_bb);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Move stmt to created bb\n");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
            }
          /* Move all stored value producers if possible.  */
          while (!gsi_end_p (gsi))
            {
              tree lhs;
              imm_use_iterator imm_iter;
              use_operand_p use_p;
              bool res;

              /* Skip debug statements.  */
              if (is_gimple_debug (gsi_stmt (gsi)))
                {
                  gsi_prev (&gsi);
                  continue;
                }
              stmt1 = gsi_stmt (gsi);
              /* Do not consider statements writing to memory or having
                 a volatile operand.  */
              if (gimple_vdef (stmt1)
                  || gimple_has_volatile_ops (stmt1))
                break;
              gsi_from = gsi;
              gsi_prev (&gsi);
              lhs = gimple_get_lhs (stmt1);
              if (!lhs)
                break;

              /* The LHS of a vectorized stmt must be an SSA_NAME.  */
              if (TREE_CODE (lhs) != SSA_NAME)
                break;
              if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
                {
                  /* Remove dead scalar statement.  */
                  if (has_zero_uses (lhs))
                    {
                      gsi_remove (&gsi_from, true);
                      continue;
                    }
                  /* A live scalar statement stops the traversal.  */
                  break;
                }
              /* Check that LHS does not have uses outside of STORE_BB.  */
              res = true;
              FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
                {
                  gimple *use_stmt;
                  use_stmt = USE_STMT (use_p);
                  if (is_gimple_debug (use_stmt))
                    continue;
                  if (gimple_bb (use_stmt) != store_bb)
                    {
                      res = false;
                      break;
                    }
                }
              if (!res)
                break;
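
              /* STMT1 may only be sunk if it reads the same memory state as
                 the store already moved to STORE_BB; a different VUSE would
                 mean an intervening memory write between them.  */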
              if (gimple_vuse (stmt1)
                  && gimple_vuse (stmt1) != gimple_vuse (last_store))
                break;

              /* Can move STMT1 to STORE_BB.  */
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "Move stmt to created bb\n");
                  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
                }
              gsi_move_before (&gsi_from, &gsi_to);
              /* Shift GSI_TO for further insertion.  */
              gsi_prev (&gsi_to);
            }
          /* Put other masked stores with the same mask to STORE_BB.  */
          if (worklist.is_empty ()
              || gimple_call_arg (worklist.last (), 2) != mask
              || worklist.last () != stmt1)
            break;
          last = worklist.pop ();
        }
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);