gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations which vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157
 158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 159    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 160    may already be set for general statements (not just data refs).  */
 161
 162 static bool
 163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
 164                               bool vectype_maybe_set_p,
 165                               poly_uint64 *vf,
 166                               vec<stmt_vec_info > *mask_producers)
 167 {
 168   gimple *stmt = stmt_info->stmt;
 169
 170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 171        && !STMT_VINFO_LIVE_P (stmt_info))
 172       || gimple_clobber_p (stmt))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 176       return true;
 177     }
 178
 179   tree stmt_vectype, nunits_vectype;
 180   if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
 181                                        &nunits_vectype))
 182     return false;
 183
 184   if (stmt_vectype)
 185     {
 186       if (STMT_VINFO_VECTYPE (stmt_info))
 187         /* The only case when a vectype had been already set is for stmts
 188            that contain a data ref, or for "pattern-stmts" (stmts generated
 189            by the vectorizer to represent/replace a certain idiom).  */
 190         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 191                      || vectype_maybe_set_p)
 192                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 193       else if (stmt_vectype == boolean_type_node)
 194         mask_producers->safe_push (stmt_info);
 195       else
 196         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 197     }
 198
 199   if (nunits_vectype)
 200     vect_update_max_nunits (vf, nunits_vectype);
 201
 202   return true;
 203 }
 204
 205 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 206    types of STMT_INFO and all attached pattern statements and update
 207    the vectorization factor VF accordingly.  If some of the statements
 208    produce a mask result whose vector type can only be calculated later,
 209    add them to MASK_PRODUCERS.  Return true on success or false if
 210    something prevented vectorization.  */
 211
 212 static bool
 213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
 214                             vec<stmt_vec_info > *mask_producers)
 215 {
 216   if (dump_enabled_p ())
 217     {
 218       dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
 219       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 220     }
 221   if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
 222     return false;
 223
 224   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 225       && STMT_VINFO_RELATED_STMT (stmt_info))
 226     {
 227       stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
 228
 229       /* If a pattern statement has def stmts, analyze them too.  */
 230       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 231       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 232            !gsi_end_p (si); gsi_next (&si))
 233         {
 234           stmt_vec_info def_stmt_info = vinfo_for_stmt (gsi_stmt (si));
 235           if (dump_enabled_p ())
 236             {
 237               dump_printf_loc (MSG_NOTE, vect_location,
 238                                "==> examining pattern def stmt: ");
 239               dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 240                                 def_stmt_info->stmt, 0);
 241             }
 242           if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 243                                              vf, mask_producers))
 244             return false;
 245         }
 246
 247       if (dump_enabled_p ())
 248         {
 249           dump_printf_loc (MSG_NOTE, vect_location,
 250                            "==> examining pattern statement: ");
 251           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 252         }
 253       if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
 254         return false;
 255     }
 256
 257   return true;
 258 }
 259
 260 /* Function vect_determine_vectorization_factor
 261
 262    Determine the vectorization factor (VF).  VF is the number of data elements
 263    that are operated upon in parallel in a single iteration of the vectorized
 264    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 265    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 266    elements can fit in a single vector register.
 267
 268    We currently support vectorization of loops in which all types operated upon
 269    are of the same size.  Therefore this function currently sets VF according to
 270    the size of the types operated upon, and fails if there are multiple sizes
 271    in the loop.
 272
 273    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 274    original loop:
 275         for (i=0; i<N; i++){
 276           a[i] = b[i] + c[i];
 277         }
 278
 279    vectorized loop:
 280         for (i=0; i<N; i+=VF){
 281           a[i:VF] = b[i:VF] + c[i:VF];
 282         }
 283 */
 284
 285 static bool
 286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 287 {
 288   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 289   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 290   unsigned nbbs = loop->num_nodes;
 291   poly_uint64 vectorization_factor = 1;
 292   tree scalar_type = NULL_TREE;
 293   gphi *phi;
 294   tree vectype;
 295   stmt_vec_info stmt_info;
 296   unsigned i;
 297   auto_vec<stmt_vec_info> mask_producers;
 298
 299   if (dump_enabled_p ())
 300     dump_printf_loc (MSG_NOTE, vect_location,
 301                      "=== vect_determine_vectorization_factor ===\n");
 302
 303   for (i = 0; i < nbbs; i++)
 304     {
 305       basic_block bb = bbs[i];
 306
 307       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 308            gsi_next (&si))
 309         {
 310           phi = si.phi ();
 311           stmt_info = vinfo_for_stmt (phi);
 312           if (dump_enabled_p ())
 313             {
 314               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 315               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 316             }
 317
 318           gcc_assert (stmt_info);
 319
 320           if (STMT_VINFO_RELEVANT_P (stmt_info)
 321               || STMT_VINFO_LIVE_P (stmt_info))
 322             {
 323               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 324               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 325
 326               if (dump_enabled_p ())
 327                 {
 328                   dump_printf_loc (MSG_NOTE, vect_location,
 329                                    "get vectype for scalar type:  ");
 330                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 331                   dump_printf (MSG_NOTE, "\n");
 332                 }
 333
 334               vectype = get_vectype_for_scalar_type (scalar_type);
 335               if (!vectype)
 336                 {
 337                   if (dump_enabled_p ())
 338                     {
 339                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 340                                        "not vectorized: unsupported "
 341                                        "data-type ");
 342                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 343                                          scalar_type);
 344                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 345                     }
 346                   return false;
 347                 }
 348               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 349
 350               if (dump_enabled_p ())
 351                 {
 352                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 353                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 354                   dump_printf (MSG_NOTE, "\n");
 355                 }
 356
 357               if (dump_enabled_p ())
 358                 {
 359                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 360                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 361                   dump_printf (MSG_NOTE, "\n");
 362                 }
 363
 364               vect_update_max_nunits (&vectorization_factor, vectype);
 365             }
 366         }
 367
 368       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 369            gsi_next (&si))
 370         {
 371           stmt_info = vinfo_for_stmt (gsi_stmt (si));
 372           if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
 373                                            &mask_producers))
 374             return false;
 375         }
 376     }
 377
 378   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 379   if (dump_enabled_p ())
 380     {
 381       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 382       dump_dec (MSG_NOTE, vectorization_factor);
 383       dump_printf (MSG_NOTE, "\n");
 384     }
 385
 386   if (known_le (vectorization_factor, 1U))
 387     {
 388       if (dump_enabled_p ())
 389         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 390                          "not vectorized: unsupported data-type\n");
 391       return false;
 392     }
 393   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 394
 395   for (i = 0; i < mask_producers.length (); i++)
 396     {
 397       stmt_info = mask_producers[i];
 398       tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
 399       if (!mask_type)
 400         return false;
 401       STMT_VINFO_VECTYPE (stmt_info) = mask_type;
 402     }
 403
 404   return true;
 405 }
 406
 407
 408 /* Function vect_is_simple_iv_evolution.
 409
 410    FORNOW: A simple evolution of an induction variables in the loop is
 411    considered a polynomial evolution.  */
 412
 413 static bool
 414 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 415                              tree * step)
 416 {
 417   tree init_expr;
 418   tree step_expr;
 419   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 420   basic_block bb;
 421
 422   /* When there is no evolution in this loop, the evolution function
 423      is not "simple".  */
 424   if (evolution_part == NULL_TREE)
 425     return false;
 426
 427   /* When the evolution is a polynomial of degree >= 2
 428      the evolution function is not "simple".  */
 429   if (tree_is_chrec (evolution_part))
 430     return false;
 431
 432   step_expr = evolution_part;
 433   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 434
 435   if (dump_enabled_p ())
 436     {
 437       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 438       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 439       dump_printf (MSG_NOTE, ",  init: ");
 440       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 441       dump_printf (MSG_NOTE, "\n");
 442     }
 443
 444   *init = init_expr;
 445   *step = step_expr;
 446
 447   if (TREE_CODE (step_expr) != INTEGER_CST
 448       && (TREE_CODE (step_expr) != SSA_NAME
 449           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 450               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 451           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 452               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 453                   || !flag_associative_math)))
 454       && (TREE_CODE (step_expr) != REAL_CST
 455           || !flag_associative_math))
 456     {
 457       if (dump_enabled_p ())
 458         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 459                          "step unknown.\n");
 460       return false;
 461     }
 462
 463   return true;
 464 }
 465
 466 /* Function vect_analyze_scalar_cycles_1.
 467
 468    Examine the cross iteration def-use cycles of scalar variables
 469    in LOOP.  LOOP_VINFO represents the loop that is now being
 470    considered for vectorization (can be LOOP, or an outer-loop
 471    enclosing LOOP).  */
 472
 473 static void
 474 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 475 {
 476   basic_block bb = loop->header;
 477   tree init, step;
 478   auto_vec<gimple *, 64> worklist;
 479   gphi_iterator gsi;
 480   bool double_reduc;
 481
 482   if (dump_enabled_p ())
 483     dump_printf_loc (MSG_NOTE, vect_location,
 484                      "=== vect_analyze_scalar_cycles ===\n");
 485
 486   /* First - identify all inductions.  Reduction detection assumes that all the
 487      inductions have been identified, therefore, this order must not be
 488      changed.  */
 489   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 490     {
 491       gphi *phi = gsi.phi ();
 492       tree access_fn = NULL;
 493       tree def = PHI_RESULT (phi);
 494       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 495
 496       if (dump_enabled_p ())
 497         {
 498           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 499           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 500         }
 501
 502       /* Skip virtual phi's.  The data dependences that are associated with
 503          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 504       if (virtual_operand_p (def))
 505         continue;
 506
 507       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 508
 509       /* Analyze the evolution function.  */
 510       access_fn = analyze_scalar_evolution (loop, def);
 511       if (access_fn)
 512         {
 513           STRIP_NOPS (access_fn);
 514           if (dump_enabled_p ())
 515             {
 516               dump_printf_loc (MSG_NOTE, vect_location,
 517                                "Access function of PHI: ");
 518               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 519               dump_printf (MSG_NOTE, "\n");
 520             }
 521           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 522             = initial_condition_in_loop_num (access_fn, loop->num);
 523           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 524             = evolution_part_in_loop_num (access_fn, loop->num);
 525         }
 526
 527       if (!access_fn
 528           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 529           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 530               && TREE_CODE (step) != INTEGER_CST))
 531         {
 532           worklist.safe_push (phi);
 533           continue;
 534         }
 535
 536       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 537                   != NULL_TREE);
 538       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 539
 540       if (dump_enabled_p ())
 541         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 542       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 543     }
 544
 545
 546   /* Second - identify all reductions and nested cycles.  */
 547   while (worklist.length () > 0)
 548     {
 549       gimple *phi = worklist.pop ();
 550       tree def = PHI_RESULT (phi);
 551       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 552       gimple *reduc_stmt;
 553
 554       if (dump_enabled_p ())
 555         {
 556           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 557           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 558         }
 559
 560       gcc_assert (!virtual_operand_p (def)
 561                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 562
 563       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 564                                                 &double_reduc, false);
 565       if (reduc_stmt)
 566         {
 567           if (double_reduc)
 568             {
 569               if (dump_enabled_p ())
 570                 dump_printf_loc (MSG_NOTE, vect_location,
 571                                  "Detected double reduction.\n");
 572
 573               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 574               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 575                                                     vect_double_reduction_def;
 576             }
 577           else
 578             {
 579               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 580                 {
 581                   if (dump_enabled_p ())
 582                     dump_printf_loc (MSG_NOTE, vect_location,
 583                                      "Detected vectorizable nested cycle.\n");
 584
 585                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 586                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 587                                                              vect_nested_cycle;
 588                 }
 589               else
 590                 {
 591                   if (dump_enabled_p ())
 592                     dump_printf_loc (MSG_NOTE, vect_location,
 593                                      "Detected reduction.\n");
 594
 595                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 596                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 597                                                            vect_reduction_def;
 598                   /* Store the reduction cycles for possible vectorization in
 599                      loop-aware SLP if it was not detected as reduction
 600                      chain.  */
 601                   if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 602                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 603                 }
 604             }
 605         }
 606       else
 607         if (dump_enabled_p ())
 608           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 609                            "Unknown def-use cycle pattern.\n");
 610     }
 611 }
 612
 613
 614 /* Function vect_analyze_scalar_cycles.
 615
 616    Examine the cross iteration def-use cycles of scalar variables, by
 617    analyzing the loop-header PHIs of scalar variables.  Classify each
 618    cycle as one of the following: invariant, induction, reduction, unknown.
 619    We do that for the loop represented by LOOP_VINFO, and also to its
 620    inner-loop, if exists.
 621    Examples for scalar cycles:
 622
 623    Example1: reduction:
 624
 625               loop1:
 626               for (i=0; i<N; i++)
 627                  sum += a[i];
 628
 629    Example2: induction:
 630
 631               loop2:
 632               for (i=0; i<N; i++)
 633                  a[i] = i;  */
 634
 635 static void
 636 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 637 {
 638   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 639
 640   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 641
 642   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 643      Reductions in such inner-loop therefore have different properties than
 644      the reductions in the nest that gets vectorized:
 645      1. When vectorized, they are executed in the same order as in the original
 646         scalar loop, so we can't change the order of computation when
 647         vectorizing them.
 648      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 649         current checks are too strict.  */
 650
 651   if (loop->inner)
 652     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 653 }
 654
 655 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 656
 657 static void
 658 vect_fixup_reduc_chain (gimple *stmt)
 659 {
 660   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 661   gimple *stmtp;
 662   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 663               && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 664   REDUC_GROUP_SIZE (vinfo_for_stmt (firstp))
 665     = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
 666   do
 667     {
 668       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 669       REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 670       stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 671       if (stmt)
 672         REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 673           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 674     }
 675   while (stmt);
 676   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 677 }
 678
 679 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 680
 681 static void
 682 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 683 {
 684   gimple *first;
 685   unsigned i;
 686
 687   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 688     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 689       {
 690         gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 691         while (next)
 692           {
 693             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 694               break;
 695             next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 696           }
 697         /* If not all stmt in the chain are patterns try to handle
 698            the chain without patterns.  */
 699         if (! next)
 700           {
 701             vect_fixup_reduc_chain (first);
 702             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 703               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 704           }
 705       }
 706 }
 707
 708 /* Function vect_get_loop_niters.
 709
 710    Determine how many iterations the loop is executed and place it
 711    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 712    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 713    niter information holds in ASSUMPTIONS.
 714
 715    Return the loop exit condition.  */
 716
 717
 718 static gcond *
 719 vect_get_loop_niters (struct loop *loop, tree *assumptions,
 720                       tree *number_of_iterations, tree *number_of_iterationsm1)
 721 {
 722   edge exit = single_exit (loop);
 723   struct tree_niter_desc niter_desc;
 724   tree niter_assumptions, niter, may_be_zero;
 725   gcond *cond = get_loop_exit_condition (loop);
 726
 727   *assumptions = boolean_true_node;
 728   *number_of_iterationsm1 = chrec_dont_know;
 729   *number_of_iterations = chrec_dont_know;
 730   if (dump_enabled_p ())
 731     dump_printf_loc (MSG_NOTE, vect_location,
 732                      "=== get_loop_niters ===\n");
 733
 734   if (!exit)
 735     return cond;
 736
 737   niter = chrec_dont_know;
 738   may_be_zero = NULL_TREE;
 739   niter_assumptions = boolean_true_node;
 740   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 741       || chrec_contains_undetermined (niter_desc.niter))
 742     return cond;
 743
 744   niter_assumptions = niter_desc.assumptions;
 745   may_be_zero = niter_desc.may_be_zero;
 746   niter = niter_desc.niter;
 747
 748   if (may_be_zero && integer_zerop (may_be_zero))
 749     may_be_zero = NULL_TREE;
 750
 751   if (may_be_zero)
 752     {
 753       if (COMPARISON_CLASS_P (may_be_zero))
 754         {
 755           /* Try to combine may_be_zero with assumptions, this can simplify
 756              computation of niter expression.  */
 757           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 758             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 759                                              niter_assumptions,
 760                                              fold_build1 (TRUTH_NOT_EXPR,
 761                                                           boolean_type_node,
 762                                                           may_be_zero));
 763           else
 764             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 765                                  build_int_cst (TREE_TYPE (niter), 0),
 766                                  rewrite_to_non_trapping_overflow (niter));
 767
 768           may_be_zero = NULL_TREE;
 769         }
 770       else if (integer_nonzerop (may_be_zero))
 771         {
 772           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 773           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 774           return cond;
 775         }
 776       else
 777         return cond;
 778     }
 779
 780   *assumptions = niter_assumptions;
 781   *number_of_iterationsm1 = niter;
 782
 783   /* We want the number of loop header executions which is the number
 784      of latch executions plus one.
 785      ???  For UINT_MAX latch executions this number overflows to zero
 786      for loops like do { n++; } while (n != 0);  */
 787   if (niter && !chrec_contains_undetermined (niter))
 788     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 789                           build_int_cst (TREE_TYPE (niter), 1));
 790   *number_of_iterations = niter;
 791
 792   return cond;
 793 }
 794
 795 /* Function bb_in_loop_p
 796
 797    Used as predicate for dfs order traversal of the loop bbs.  */
 798
 799 static bool
 800 bb_in_loop_p (const_basic_block bb, const void *data)
 801 {
 802   const struct loop *const loop = (const struct loop *)data;
 803   if (flow_bb_inside_loop_p (loop, bb))
 804     return true;
 805   return false;
 806 }
 807
 808
 809 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 810    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 811
 812 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
 813   : vec_info (vec_info::loop, init_cost (loop_in)),
 814     loop (loop_in),
 815     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 816     num_itersm1 (NULL_TREE),
 817     num_iters (NULL_TREE),
 818     num_iters_unchanged (NULL_TREE),
 819     num_iters_assumptions (NULL_TREE),
 820     th (0),
 821     versioning_threshold (0),
 822     vectorization_factor (0),
 823     max_vectorization_factor (0),
 824     mask_skip_niters (NULL_TREE),
 825     mask_compare_type (NULL_TREE),
 826     unaligned_dr (NULL),
 827     peeling_for_alignment (0),
 828     ptr_mask (0),
 829     ivexpr_map (NULL),
 830     slp_unrolling_factor (1),
 831     single_scalar_iteration_cost (0),
 832     vectorizable (false),
 833     can_fully_mask_p (true),
 834     fully_masked_p (false),
 835     peeling_for_gaps (false),
 836     peeling_for_niter (false),
 837     operands_swapped (false),
 838     no_data_dependencies (false),
 839     has_mask_store (false),
 840     scalar_loop (NULL),
 841     orig_loop_info (NULL)
 842 {
 843   /* Create/Update stmt_info for all stmts in the loop.  */
 844   basic_block *body = get_loop_body (loop);
 845   for (unsigned int i = 0; i < loop->num_nodes; i++)
 846     {
 847       basic_block bb = body[i];
 848       gimple_stmt_iterator si;
 849
 850       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 851         {
 852           gimple *phi = gsi_stmt (si);
 853           gimple_set_uid (phi, 0);
 854           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
 855         }
 856
 857       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 858         {
 859           gimple *stmt = gsi_stmt (si);
 860           gimple_set_uid (stmt, 0);
 861           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
 862         }
 863     }
 864   free (body);
 865
 866   /* CHECKME: We want to visit all BBs before their successors (except for
 867      latch blocks, for which this assertion wouldn't hold).  In the simple
 868      case of the loop forms we allow, a dfs order of the BBs would the same
 869      as reversed postorder traversal, so we are safe.  */
 870
 871   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 872                                           bbs, loop->num_nodes, loop);
 873   gcc_assert (nbbs == loop->num_nodes);
 874 }
 875
 876 /* Free all levels of MASKS.  */
 877
 878 void
 879 release_vec_loop_masks (vec_loop_masks *masks)
 880 {
 881   rgroup_masks *rgm;
 882   unsigned int i;
 883   FOR_EACH_VEC_ELT (*masks, i, rgm)
 884     rgm->masks.release ();
 885   masks->release ();
 886 }
 887
 888 /* Free all memory used by the _loop_vec_info, as well as all the
 889    stmt_vec_info structs of all the stmts in the loop.  */
 890
 891 _loop_vec_info::~_loop_vec_info ()
 892 {
 893   int nbbs;
 894   gimple_stmt_iterator si;
 895   int j;
 896
 897   nbbs = loop->num_nodes;
 898   for (j = 0; j < nbbs; j++)
 899     {
 900       basic_block bb = bbs[j];
 901       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 902         free_stmt_vec_info (gsi_stmt (si));
 903
 904       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 905         {
 906           gimple *stmt = gsi_stmt (si);
 907
 908           /* We may have broken canonical form by moving a constant
 909              into RHS1 of a commutative op.  Fix such occurrences.  */
 910           if (operands_swapped && is_gimple_assign (stmt))
 911             {
 912               enum tree_code code = gimple_assign_rhs_code (stmt);
 913
 914               if ((code == PLUS_EXPR
 915                    || code == POINTER_PLUS_EXPR
 916                    || code == MULT_EXPR)
 917                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 918                 swap_ssa_operands (stmt,
 919                                    gimple_assign_rhs1_ptr (stmt),
 920                                    gimple_assign_rhs2_ptr (stmt));
 921               else if (code == COND_EXPR
 922                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
 923                 {
 924                   tree cond_expr = gimple_assign_rhs1 (stmt);
 925                   enum tree_code cond_code = TREE_CODE (cond_expr);
 926
 927                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
 928                     {
 929                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
 930                                                                   0));
 931                       cond_code = invert_tree_comparison (cond_code,
 932                                                           honor_nans);
 933                       if (cond_code != ERROR_MARK)
 934                         {
 935                           TREE_SET_CODE (cond_expr, cond_code);
 936                           swap_ssa_operands (stmt,
 937                                              gimple_assign_rhs2_ptr (stmt),
 938                                              gimple_assign_rhs3_ptr (stmt));
 939                         }
 940                     }
 941                 }
 942             }
 943
 944           /* Free stmt_vec_info.  */
 945           free_stmt_vec_info (stmt);
 946           gsi_next (&si);
 947         }
 948     }
 949
 950   free (bbs);
 951
 952   release_vec_loop_masks (&masks);
 953   delete ivexpr_map;
 954
 955   loop->aux = NULL;
 956 }
 957
 958 /* Return an invariant or register for EXPR and emit necessary
 959    computations in the LOOP_VINFO loop preheader.  */
 960
 961 tree
 962 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 963 {
 964   if (is_gimple_reg (expr)
 965       || is_gimple_min_invariant (expr))
 966     return expr;
 967
 968   if (! loop_vinfo->ivexpr_map)
 969     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 970   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 971   if (! cached)
 972     {
 973       gimple_seq stmts = NULL;
 974       cached = force_gimple_operand (unshare_expr (expr),
 975                                      &stmts, true, NULL_TREE);
 976       if (stmts)
 977         {
 978           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 979           gsi_insert_seq_on_edge_immediate (e, stmts);
 980         }
 981     }
 982   return cached;
 983 }
 984
 985 /* Return true if we can use CMP_TYPE as the comparison type to produce
 986    all masks required to mask LOOP_VINFO.  */
 987
 988 static bool
 989 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 990 {
 991   rgroup_masks *rgm;
 992   unsigned int i;
 993   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 994     if (rgm->mask_type != NULL_TREE
 995         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 996                                             cmp_type, rgm->mask_type,
 997                                             OPTIMIZE_FOR_SPEED))
 998       return false;
 999   return true;
1000 }
1001
1002 /* Calculate the maximum number of scalars per iteration for every
1003    rgroup in LOOP_VINFO.  */
1004
1005 static unsigned int
1006 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1007 {
1008   unsigned int res = 1;
1009   unsigned int i;
1010   rgroup_masks *rgm;
1011   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1012     res = MAX (res, rgm->max_nscalars_per_iter);
1013   return res;
1014 }
1015
1016 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1017    whether we can actually generate the masks required.  Return true if so,
1018    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1019
1020 static bool
1021 vect_verify_full_masking (loop_vec_info loop_vinfo)
1022 {
1023   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1024   unsigned int min_ni_width;
1025
1026   /* Use a normal loop if there are no statements that need masking.
1027      This only happens in rare degenerate cases: it means that the loop
1028      has no loads, no stores, and no live-out values.  */
1029   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1030     return false;
1031
1032   /* Get the maximum number of iterations that is representable
1033      in the counter type.  */
1034   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1035   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1036
1037   /* Get a more refined estimate for the number of iterations.  */
1038   widest_int max_back_edges;
1039   if (max_loop_iterations (loop, &max_back_edges))
1040     max_ni = wi::smin (max_ni, max_back_edges + 1);
1041
1042   /* Account for rgroup masks, in which each bit is replicated N times.  */
1043   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1044
1045   /* Work out how many bits we need to represent the limit.  */
1046   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1047
1048   /* Find a scalar mode for which WHILE_ULT is supported.  */
1049   opt_scalar_int_mode cmp_mode_iter;
1050   tree cmp_type = NULL_TREE;
1051   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1052     {
1053       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1054       if (cmp_bits >= min_ni_width
1055           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1056         {
1057           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1058           if (this_type
1059               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1060             {
1061               /* Although we could stop as soon as we find a valid mode,
1062                  it's often better to continue until we hit Pmode, since the
1063                  operands to the WHILE are more likely to be reusable in
1064                  address calculations.  */
1065               cmp_type = this_type;
1066               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1067                 break;
1068             }
1069         }
1070     }
1071
1072   if (!cmp_type)
1073     return false;
1074
1075   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1076   return true;
1077 }
1078
1079 /* Calculate the cost of one scalar iteration of the loop.  */
1080 static void
1081 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1082 {
1083   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1084   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1085   int nbbs = loop->num_nodes, factor;
1086   int innerloop_iters, i;
1087
1088   /* Gather costs for statements in the scalar loop.  */
1089
1090   /* FORNOW.  */
1091   innerloop_iters = 1;
1092   if (loop->inner)
1093     innerloop_iters = 50; /* FIXME */
1094
1095   for (i = 0; i < nbbs; i++)
1096     {
1097       gimple_stmt_iterator si;
1098       basic_block bb = bbs[i];
1099
1100       if (bb->loop_father == loop->inner)
1101         factor = innerloop_iters;
1102       else
1103         factor = 1;
1104
1105       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1106         {
1107           gimple *stmt = gsi_stmt (si);
1108           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1109
1110           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1111             continue;
1112
1113           /* Skip stmts that are not vectorized inside the loop.  */
1114           if (stmt_info
1115               && !STMT_VINFO_RELEVANT_P (stmt_info)
1116               && (!STMT_VINFO_LIVE_P (stmt_info)
1117                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1118               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1119             continue;
1120
1121           vect_cost_for_stmt kind;
1122           if (STMT_VINFO_DATA_REF (stmt_info))
1123             {
1124               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1125                kind = scalar_load;
1126              else
1127                kind = scalar_store;
1128             }
1129           else
1130             kind = scalar_stmt;
1131
1132           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1133                             factor, kind, stmt_info, 0, vect_prologue);
1134         }
1135     }
1136
1137   /* Now accumulate cost.  */
1138   void *target_cost_data = init_cost (loop);
1139   stmt_info_for_cost *si;
1140   int j;
1141   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1142                     j, si)
1143     {
1144       struct _stmt_vec_info *stmt_info
1145         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1146       (void) add_stmt_cost (target_cost_data, si->count,
1147                             si->kind, stmt_info, si->misalign,
1148                             vect_body);
1149     }
1150   unsigned dummy, body_cost = 0;
1151   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1152   destroy_cost_data (target_cost_data);
1153   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1154 }
1155
1156
1157 /* Function vect_analyze_loop_form_1.
1158
1159    Verify that certain CFG restrictions hold, including:
1160    - the loop has a pre-header
1161    - the loop has a single entry and exit
1162    - the loop exit condition is simple enough
1163    - the number of iterations can be analyzed, i.e, a countable loop.  The
1164      niter could be analyzed under some assumptions.  */
1165
1166 bool
1167 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1168                           tree *assumptions, tree *number_of_iterationsm1,
1169                           tree *number_of_iterations, gcond **inner_loop_cond)
1170 {
1171   if (dump_enabled_p ())
1172     dump_printf_loc (MSG_NOTE, vect_location,
1173                      "=== vect_analyze_loop_form ===\n");
1174
1175   /* Different restrictions apply when we are considering an inner-most loop,
1176      vs. an outer (nested) loop.
1177      (FORNOW. May want to relax some of these restrictions in the future).  */
1178
1179   if (!loop->inner)
1180     {
1181       /* Inner-most loop.  We currently require that the number of BBs is
1182          exactly 2 (the header and latch).  Vectorizable inner-most loops
1183          look like this:
1184
1185                         (pre-header)
1186                            |
1187                           header <--------+
1188                            | |            |
1189                            | +--> latch --+
1190                            |
1191                         (exit-bb)  */
1192
1193       if (loop->num_nodes != 2)
1194         {
1195           if (dump_enabled_p ())
1196             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1197                              "not vectorized: control flow in loop.\n");
1198           return false;
1199         }
1200
1201       if (empty_block_p (loop->header))
1202         {
1203           if (dump_enabled_p ())
1204             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1205                              "not vectorized: empty loop.\n");
1206           return false;
1207         }
1208     }
1209   else
1210     {
1211       struct loop *innerloop = loop->inner;
1212       edge entryedge;
1213
1214       /* Nested loop. We currently require that the loop is doubly-nested,
1215          contains a single inner loop, and the number of BBs is exactly 5.
1216          Vectorizable outer-loops look like this:
1217
1218                         (pre-header)
1219                            |
1220                           header <---+
1221                            |         |
1222                           inner-loop |
1223                            |         |
1224                           tail ------+
1225                            |
1226                         (exit-bb)
1227
1228          The inner-loop has the properties expected of inner-most loops
1229          as described above.  */
1230
1231       if ((loop->inner)->inner || (loop->inner)->next)
1232         {
1233           if (dump_enabled_p ())
1234             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235                              "not vectorized: multiple nested loops.\n");
1236           return false;
1237         }
1238
1239       if (loop->num_nodes != 5)
1240         {
1241           if (dump_enabled_p ())
1242             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1243                              "not vectorized: control flow in loop.\n");
1244           return false;
1245         }
1246
1247       entryedge = loop_preheader_edge (innerloop);
1248       if (entryedge->src != loop->header
1249           || !single_exit (innerloop)
1250           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1251         {
1252           if (dump_enabled_p ())
1253             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254                              "not vectorized: unsupported outerloop form.\n");
1255           return false;
1256         }
1257
1258       /* Analyze the inner-loop.  */
1259       tree inner_niterm1, inner_niter, inner_assumptions;
1260       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1261                                       &inner_assumptions, &inner_niterm1,
1262                                       &inner_niter, NULL)
1263           /* Don't support analyzing niter under assumptions for inner
1264              loop.  */
1265           || !integer_onep (inner_assumptions))
1266         {
1267           if (dump_enabled_p ())
1268             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269                              "not vectorized: Bad inner loop.\n");
1270           return false;
1271         }
1272
1273       if (!expr_invariant_in_loop_p (loop, inner_niter))
1274         {
1275           if (dump_enabled_p ())
1276             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1277                              "not vectorized: inner-loop count not"
1278                              " invariant.\n");
1279           return false;
1280         }
1281
1282       if (dump_enabled_p ())
1283         dump_printf_loc (MSG_NOTE, vect_location,
1284                          "Considering outer-loop vectorization.\n");
1285     }
1286
1287   if (!single_exit (loop)
1288       || EDGE_COUNT (loop->header->preds) != 2)
1289     {
1290       if (dump_enabled_p ())
1291         {
1292           if (!single_exit (loop))
1293             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1294                              "not vectorized: multiple exits.\n");
1295           else if (EDGE_COUNT (loop->header->preds) != 2)
1296             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1297                              "not vectorized: too many incoming edges.\n");
1298         }
1299       return false;
1300     }
1301
1302   /* We assume that the loop exit condition is at the end of the loop. i.e,
1303      that the loop is represented as a do-while (with a proper if-guard
1304      before the loop if needed), where the loop header contains all the
1305      executable statements, and the latch is empty.  */
1306   if (!empty_block_p (loop->latch)
1307       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1308     {
1309       if (dump_enabled_p ())
1310         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311                          "not vectorized: latch block not empty.\n");
1312       return false;
1313     }
1314
1315   /* Make sure the exit is not abnormal.  */
1316   edge e = single_exit (loop);
1317   if (e->flags & EDGE_ABNORMAL)
1318     {
1319       if (dump_enabled_p ())
1320         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1321                          "not vectorized: abnormal loop exit edge.\n");
1322       return false;
1323     }
1324
1325   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1326                                      number_of_iterationsm1);
1327   if (!*loop_cond)
1328     {
1329       if (dump_enabled_p ())
1330         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1331                          "not vectorized: complicated exit condition.\n");
1332       return false;
1333     }
1334
1335   if (integer_zerop (*assumptions)
1336       || !*number_of_iterations
1337       || chrec_contains_undetermined (*number_of_iterations))
1338     {
1339       if (dump_enabled_p ())
1340         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1341                          "not vectorized: number of iterations cannot be "
1342                          "computed.\n");
1343       return false;
1344     }
1345
1346   if (integer_zerop (*number_of_iterations))
1347     {
1348       if (dump_enabled_p ())
1349         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1350                          "not vectorized: number of iterations = 0.\n");
1351       return false;
1352     }
1353
1354   return true;
1355 }
1356
1357 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1358
1359 loop_vec_info
1360 vect_analyze_loop_form (struct loop *loop)
1361 {
1362   tree assumptions, number_of_iterations, number_of_iterationsm1;
1363   gcond *loop_cond, *inner_loop_cond = NULL;
1364
1365   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1366                                   &assumptions, &number_of_iterationsm1,
1367                                   &number_of_iterations, &inner_loop_cond))
1368     return NULL;
1369
1370   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1371   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1372   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1373   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1374   if (!integer_onep (assumptions))
1375     {
1376       /* We consider to vectorize this loop by versioning it under
1377          some assumptions.  In order to do this, we need to clear
1378          existing information computed by scev and niter analyzer.  */
1379       scev_reset_htab ();
1380       free_numbers_of_iterations_estimates (loop);
1381       /* Also set flag for this loop so that following scev and niter
1382          analysis are done under the assumptions.  */
1383       loop_constraint_set (loop, LOOP_C_FINITE);
1384       /* Also record the assumptions for versioning.  */
1385       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1386     }
1387
1388   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1389     {
1390       if (dump_enabled_p ())
1391         {
1392           dump_printf_loc (MSG_NOTE, vect_location,
1393                            "Symbolic number of iterations is ");
1394           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1395           dump_printf (MSG_NOTE, "\n");
1396         }
1397     }
1398
1399   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1400   if (inner_loop_cond)
1401     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1402       = loop_exit_ctrl_vec_info_type;
1403
1404   gcc_assert (!loop->aux);
1405   loop->aux = loop_vinfo;
1406   return loop_vinfo;
1407 }
1408
1409
1410
1411 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1412    statements update the vectorization factor.  */
1413
1414 static void
1415 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1416 {
1417   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1418   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1419   int nbbs = loop->num_nodes;
1420   poly_uint64 vectorization_factor;
1421   int i;
1422
1423   if (dump_enabled_p ())
1424     dump_printf_loc (MSG_NOTE, vect_location,
1425                      "=== vect_update_vf_for_slp ===\n");
1426
1427   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1428   gcc_assert (known_ne (vectorization_factor, 0U));
1429
1430   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1431      vectorization factor of the loop is the unrolling factor required by
1432      the SLP instances.  If that unrolling factor is 1, we say, that we
1433      perform pure SLP on loop - cross iteration parallelism is not
1434      exploited.  */
1435   bool only_slp_in_loop = true;
1436   for (i = 0; i < nbbs; i++)
1437     {
1438       basic_block bb = bbs[i];
1439       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1440            gsi_next (&si))
1441         {
1442           gimple *stmt = gsi_stmt (si);
1443           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1444           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1445               && STMT_VINFO_RELATED_STMT (stmt_info))
1446             {
1447               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1448               stmt_info = vinfo_for_stmt (stmt);
1449             }
1450           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1451                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1452               && !PURE_SLP_STMT (stmt_info))
1453             /* STMT needs both SLP and loop-based vectorization.  */
1454             only_slp_in_loop = false;
1455         }
1456     }
1457
1458   if (only_slp_in_loop)
1459     {
1460       dump_printf_loc (MSG_NOTE, vect_location,
1461                        "Loop contains only SLP stmts\n");
1462       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1463     }
1464   else
1465     {
1466       dump_printf_loc (MSG_NOTE, vect_location,
1467                        "Loop contains SLP and non-SLP stmts\n");
1468       /* Both the vectorization factor and unroll factor have the form
1469          current_vector_size * X for some rational X, so they must have
1470          a common multiple.  */
1471       vectorization_factor
1472         = force_common_multiple (vectorization_factor,
1473                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1474     }
1475
1476   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1477   if (dump_enabled_p ())
1478     {
1479       dump_printf_loc (MSG_NOTE, vect_location,
1480                        "Updating vectorization factor to ");
1481       dump_dec (MSG_NOTE, vectorization_factor);
1482       dump_printf (MSG_NOTE, ".\n");
1483     }
1484 }
1485
1486 /* Return true if STMT_INFO describes a double reduction phi and if
1487    the other phi in the reduction is also relevant for vectorization.
1488    This rejects cases such as:
1489
1490       outer1:
1491         x_1 = PHI <x_3(outer2), ...>;
1492         ...
1493
1494       inner:
1495         x_2 = ...;
1496         ...
1497
1498       outer2:
1499         x_3 = PHI <x_2(inner)>;
1500
1501    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1502
1503 static bool
1504 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1505 {
1506   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1507     return false;
1508
1509   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1510   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1511 }
1512
1513 /* Function vect_analyze_loop_operations.
1514
1515    Scan the loop stmts and make sure they are all vectorizable.  */
1516
1517 static bool
1518 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1519 {
1520   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1521   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1522   int nbbs = loop->num_nodes;
1523   int i;
1524   stmt_vec_info stmt_info;
1525   bool need_to_vectorize = false;
1526   bool ok;
1527
1528   if (dump_enabled_p ())
1529     dump_printf_loc (MSG_NOTE, vect_location,
1530                      "=== vect_analyze_loop_operations ===\n");
1531
1532   stmt_vector_for_cost cost_vec;
1533   cost_vec.create (2);
1534
1535   for (i = 0; i < nbbs; i++)
1536     {
1537       basic_block bb = bbs[i];
1538
1539       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1540            gsi_next (&si))
1541         {
1542           gphi *phi = si.phi ();
1543           ok = true;
1544
1545           stmt_info = vinfo_for_stmt (phi);
1546           if (dump_enabled_p ())
1547             {
1548               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1549               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1550             }
1551           if (virtual_operand_p (gimple_phi_result (phi)))
1552             continue;
1553
1554           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1555              (i.e., a phi in the tail of the outer-loop).  */
1556           if (! is_loop_header_bb_p (bb))
1557             {
1558               /* FORNOW: we currently don't support the case that these phis
1559                  are not used in the outerloop (unless it is double reduction,
1560                  i.e., this phi is vect_reduction_def), cause this case
1561                  requires to actually do something here.  */
1562               if (STMT_VINFO_LIVE_P (stmt_info)
1563                   && !vect_active_double_reduction_p (stmt_info))
1564                 {
1565                   if (dump_enabled_p ())
1566                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1567                                      "Unsupported loop-closed phi in "
1568                                      "outer-loop.\n");
1569                   return false;
1570                 }
1571
1572               /* If PHI is used in the outer loop, we check that its operand
1573                  is defined in the inner loop.  */
1574               if (STMT_VINFO_RELEVANT_P (stmt_info))
1575                 {
1576                   tree phi_op;
1577                   gimple *op_def_stmt;
1578
1579                   if (gimple_phi_num_args (phi) != 1)
1580                     return false;
1581
1582                   phi_op = PHI_ARG_DEF (phi, 0);
1583                   if (TREE_CODE (phi_op) != SSA_NAME)
1584                     return false;
1585
1586                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1587                   if (gimple_nop_p (op_def_stmt)
1588                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1589                       || !vinfo_for_stmt (op_def_stmt))
1590                     return false;
1591
1592                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1593                         != vect_used_in_outer
1594                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1595                            != vect_used_in_outer_by_reduction)
1596                     return false;
1597                 }
1598
1599               continue;
1600             }
1601
1602           gcc_assert (stmt_info);
1603
1604           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1605                || STMT_VINFO_LIVE_P (stmt_info))
1606               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1607             {
1608               /* A scalar-dependence cycle that we don't support.  */
1609               if (dump_enabled_p ())
1610                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1611                                  "not vectorized: scalar dependence cycle.\n");
1612               return false;
1613             }
1614
1615           if (STMT_VINFO_RELEVANT_P (stmt_info))
1616             {
1617               need_to_vectorize = true;
1618               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1619                   && ! PURE_SLP_STMT (stmt_info))
1620                 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1621               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1622                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1623                        && ! PURE_SLP_STMT (stmt_info))
1624                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1625                                              &cost_vec);
1626             }
1627
1628           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1629           if (ok
1630               && STMT_VINFO_LIVE_P (stmt_info)
1631               && !PURE_SLP_STMT (stmt_info))
1632             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1633                                               &cost_vec);
1634
1635           if (!ok)
1636             {
1637               if (dump_enabled_p ())
1638                 {
1639                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1640                                    "not vectorized: relevant phi not "
1641                                    "supported: ");
1642                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1643                 }
1644               return false;
1645             }
1646         }
1647
1648       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1649            gsi_next (&si))
1650         {
1651           gimple *stmt = gsi_stmt (si);
1652           if (!gimple_clobber_p (stmt)
1653               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1654                                      &cost_vec))
1655             return false;
1656         }
1657     } /* bbs */
1658
1659   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1660   cost_vec.release ();
1661
1662   /* All operations in the loop are either irrelevant (deal with loop
1663      control, or dead), or only used outside the loop and can be moved
1664      out of the loop (e.g. invariants, inductions).  The loop can be
1665      optimized away by scalar optimizations.  We're better off not
1666      touching this loop.  */
1667   if (!need_to_vectorize)
1668     {
1669       if (dump_enabled_p ())
1670         dump_printf_loc (MSG_NOTE, vect_location,
1671                          "All the computation can be taken out of the loop.\n");
1672       if (dump_enabled_p ())
1673         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1674                          "not vectorized: redundant loop. no profit to "
1675                          "vectorize.\n");
1676       return false;
1677     }
1678
1679   return true;
1680 }
1681
1682 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1683    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1684    definitely no, or -1 if it's worth retrying.  */
1685
1686 static int
1687 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1688 {
1689   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1690   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1691
1692   /* Only fully-masked loops can have iteration counts less than the
1693      vectorization factor.  */
1694   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1695     {
1696       HOST_WIDE_INT max_niter;
1697
1698       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1699         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1700       else
1701         max_niter = max_stmt_executions_int (loop);
1702
1703       if (max_niter != -1
1704           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1705         {
1706           if (dump_enabled_p ())
1707             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1708                              "not vectorized: iteration count smaller than "
1709                              "vectorization factor.\n");
1710           return 0;
1711         }
1712     }
1713
1714   int min_profitable_iters, min_profitable_estimate;
1715   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1716                                       &min_profitable_estimate);
1717
1718   if (min_profitable_iters < 0)
1719     {
1720       if (dump_enabled_p ())
1721         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1722                          "not vectorized: vectorization not profitable.\n");
1723       if (dump_enabled_p ())
1724         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725                          "not vectorized: vector version will never be "
1726                          "profitable.\n");
1727       return -1;
1728     }
1729
1730   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1731                                * assumed_vf);
1732
1733   /* Use the cost model only if it is more conservative than user specified
1734      threshold.  */
1735   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1736                                     min_profitable_iters);
1737
1738   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1739
1740   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1741       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1742     {
1743       if (dump_enabled_p ())
1744         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1745                          "not vectorized: vectorization not profitable.\n");
1746       if (dump_enabled_p ())
1747         dump_printf_loc (MSG_NOTE, vect_location,
1748                          "not vectorized: iteration count smaller than user "
1749                          "specified loop bound parameter or minimum profitable "
1750                          "iterations (whichever is more conservative).\n");
1751       return 0;
1752     }
1753
1754   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1755   if (estimated_niter == -1)
1756     estimated_niter = likely_max_stmt_executions_int (loop);
1757   if (estimated_niter != -1
1758       && ((unsigned HOST_WIDE_INT) estimated_niter
1759           < MAX (th, (unsigned) min_profitable_estimate)))
1760     {
1761       if (dump_enabled_p ())
1762         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1763                          "not vectorized: estimated iteration count too "
1764                          "small.\n");
1765       if (dump_enabled_p ())
1766         dump_printf_loc (MSG_NOTE, vect_location,
1767                          "not vectorized: estimated iteration count smaller "
1768                          "than specified loop bound parameter or minimum "
1769                          "profitable iterations (whichever is more "
1770                          "conservative).\n");
1771       return -1;
1772     }
1773
1774   return 1;
1775 }
1776
1777
1778 /* Function vect_analyze_loop_2.
1779
1780    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1781    for it.  The different analyses will record information in the
1782    loop_vec_info struct.  */
1783 static bool
1784 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1785 {
1786   bool ok;
1787   int res;
1788   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1789   poly_uint64 min_vf = 2;
1790   unsigned int n_stmts = 0;
1791
1792   /* The first group of checks is independent of the vector size.  */
1793   fatal = true;
1794
1795   /* Find all data references in the loop (which correspond to vdefs/vuses)
1796      and analyze their evolution in the loop.  */
1797
1798   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1799
1800   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1801   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1802     {
1803       if (dump_enabled_p ())
1804         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1805                          "not vectorized: loop nest containing two "
1806                          "or more consecutive inner loops cannot be "
1807                          "vectorized\n");
1808       return false;
1809     }
1810
1811   for (unsigned i = 0; i < loop->num_nodes; i++)
1812     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1813          !gsi_end_p (gsi); gsi_next (&gsi))
1814       {
1815         gimple *stmt = gsi_stmt (gsi);
1816         if (is_gimple_debug (stmt))
1817           continue;
1818         ++n_stmts;
1819         if (!find_data_references_in_stmt (loop, stmt,
1820                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1821           {
1822             if (is_gimple_call (stmt) && loop->safelen)
1823               {
1824                 tree fndecl = gimple_call_fndecl (stmt), op;
1825                 if (fndecl != NULL_TREE)
1826                   {
1827                     cgraph_node *node = cgraph_node::get (fndecl);
1828                     if (node != NULL && node->simd_clones != NULL)
1829                       {
1830                         unsigned int j, n = gimple_call_num_args (stmt);
1831                         for (j = 0; j < n; j++)
1832                           {
1833                             op = gimple_call_arg (stmt, j);
1834                             if (DECL_P (op)
1835                                 || (REFERENCE_CLASS_P (op)
1836                                     && get_base_address (op)))
1837                               break;
1838                           }
1839                         op = gimple_call_lhs (stmt);
1840                         /* Ignore #pragma omp declare simd functions
1841                            if they don't have data references in the
1842                            call stmt itself.  */
1843                         if (j == n
1844                             && !(op
1845                                  && (DECL_P (op)
1846                                      || (REFERENCE_CLASS_P (op)
1847                                          && get_base_address (op)))))
1848                           continue;
1849                       }
1850                   }
1851               }
1852             if (dump_enabled_p ())
1853               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854                                "not vectorized: loop contains function "
1855                                "calls or data references that cannot "
1856                                "be analyzed\n");
1857             return false;
1858           }
1859       }
1860
1861   /* Analyze the data references and also adjust the minimal
1862      vectorization factor according to the loads and stores.  */
1863
1864   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1865   if (!ok)
1866     {
1867       if (dump_enabled_p ())
1868         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1869                          "bad data references.\n");
1870       return false;
1871     }
1872
1873   /* Classify all cross-iteration scalar data-flow cycles.
1874      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1875   vect_analyze_scalar_cycles (loop_vinfo);
1876
1877   vect_pattern_recog (loop_vinfo);
1878
1879   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1880
1881   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1882      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1883
1884   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1885   if (!ok)
1886     {
1887       if (dump_enabled_p ())
1888         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1889                          "bad data access.\n");
1890       return false;
1891     }
1892
1893   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1894
1895   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1896   if (!ok)
1897     {
1898       if (dump_enabled_p ())
1899         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1900                          "unexpected pattern.\n");
1901       return false;
1902     }
1903
1904   /* While the rest of the analysis below depends on it in some way.  */
1905   fatal = false;
1906
1907   /* Analyze data dependences between the data-refs in the loop
1908      and adjust the maximum vectorization factor according to
1909      the dependences.
1910      FORNOW: fail at the first data dependence that we encounter.  */
1911
1912   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1913   if (!ok
1914       || (max_vf != MAX_VECTORIZATION_FACTOR
1915           && maybe_lt (max_vf, min_vf)))
1916     {
1917       if (dump_enabled_p ())
1918             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1919                              "bad data dependence.\n");
1920       return false;
1921     }
1922   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1923
1924   ok = vect_determine_vectorization_factor (loop_vinfo);
1925   if (!ok)
1926     {
1927       if (dump_enabled_p ())
1928         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929                          "can't determine vectorization factor.\n");
1930       return false;
1931     }
1932   if (max_vf != MAX_VECTORIZATION_FACTOR
1933       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1934     {
1935       if (dump_enabled_p ())
1936         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937                          "bad data dependence.\n");
1938       return false;
1939     }
1940
1941   /* Compute the scalar iteration cost.  */
1942   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1943
1944   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1945   unsigned th;
1946
1947   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1948   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1949   if (!ok)
1950     return false;
1951
1952   /* If there are any SLP instances mark them as pure_slp.  */
1953   bool slp = vect_make_slp_decision (loop_vinfo);
1954   if (slp)
1955     {
1956       /* Find stmts that need to be both vectorized and SLPed.  */
1957       vect_detect_hybrid_slp (loop_vinfo);
1958
1959       /* Update the vectorization factor based on the SLP decision.  */
1960       vect_update_vf_for_slp (loop_vinfo);
1961     }
1962
1963   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1964
1965   /* We don't expect to have to roll back to anything other than an empty
1966      set of rgroups.  */
1967   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1968
1969   /* This is the point where we can re-start analysis with SLP forced off.  */
1970 start_over:
1971
1972   /* Now the vectorization factor is final.  */
1973   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1974   gcc_assert (known_ne (vectorization_factor, 0U));
1975
1976   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1977     {
1978       dump_printf_loc (MSG_NOTE, vect_location,
1979                        "vectorization_factor = ");
1980       dump_dec (MSG_NOTE, vectorization_factor);
1981       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1982                    LOOP_VINFO_INT_NITERS (loop_vinfo));
1983     }
1984
1985   HOST_WIDE_INT max_niter
1986     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1987
1988   /* Analyze the alignment of the data-refs in the loop.
1989      Fail if a data reference is found that cannot be vectorized.  */
1990
1991   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1992   if (!ok)
1993     {
1994       if (dump_enabled_p ())
1995         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1996                          "bad data alignment.\n");
1997       return false;
1998     }
1999
2000   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2001      It is important to call pruning after vect_analyze_data_ref_accesses,
2002      since we use grouping information gathered by interleaving analysis.  */
2003   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2004   if (!ok)
2005     return false;
2006
2007   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2008      vectorization.  */
2009   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2010     {
2011     /* This pass will decide on using loop versioning and/or loop peeling in
2012        order to enhance the alignment of data references in the loop.  */
2013     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2014     if (!ok)
2015       {
2016         if (dump_enabled_p ())
2017           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018                            "bad data alignment.\n");
2019         return false;
2020       }
2021     }
2022
2023   if (slp)
2024     {
2025       /* Analyze operations in the SLP instances.  Note this may
2026          remove unsupported SLP instances which makes the above
2027          SLP kind detection invalid.  */
2028       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2029       vect_slp_analyze_operations (loop_vinfo);
2030       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2031         goto again;
2032     }
2033
2034   /* Scan all the remaining operations in the loop that are not subject
2035      to SLP and make sure they are vectorizable.  */
2036   ok = vect_analyze_loop_operations (loop_vinfo);
2037   if (!ok)
2038     {
2039       if (dump_enabled_p ())
2040         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2041                          "bad operation or unsupported loop bound.\n");
2042       return false;
2043     }
2044
2045   /* Decide whether to use a fully-masked loop for this vectorization
2046      factor.  */
2047   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2048     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2049        && vect_verify_full_masking (loop_vinfo));
2050   if (dump_enabled_p ())
2051     {
2052       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2053         dump_printf_loc (MSG_NOTE, vect_location,
2054                          "using a fully-masked loop.\n");
2055       else
2056         dump_printf_loc (MSG_NOTE, vect_location,
2057                          "not using a fully-masked loop.\n");
2058     }
2059
2060   /* If epilog loop is required because of data accesses with gaps,
2061      one additional iteration needs to be peeled.  Check if there is
2062      enough iterations for vectorization.  */
2063   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2064       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2065       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2066     {
2067       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2068       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2069
2070       if (known_lt (wi::to_widest (scalar_niters), vf))
2071         {
2072           if (dump_enabled_p ())
2073             dump_printf_loc (MSG_NOTE, vect_location,
2074                              "loop has no enough iterations to support"
2075                              " peeling for gaps.\n");
2076           return false;
2077         }
2078     }
2079
2080   /* Check the costings of the loop make vectorizing worthwhile.  */
2081   res = vect_analyze_loop_costing (loop_vinfo);
2082   if (res < 0)
2083     goto again;
2084   if (!res)
2085     {
2086       if (dump_enabled_p ())
2087         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2088                          "Loop costings not worthwhile.\n");
2089       return false;
2090     }
2091
2092   /* Decide whether we need to create an epilogue loop to handle
2093      remaining scalar iterations.  */
2094   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2095
2096   unsigned HOST_WIDE_INT const_vf;
2097   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2098     /* The main loop handles all iterations.  */
2099     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2100   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2101            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2102     {
2103       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2104                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2105                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2106         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2107     }
2108   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2109            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2110            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2111                 < (unsigned) exact_log2 (const_vf))
2112                /* In case of versioning, check if the maximum number of
2113                   iterations is greater than th.  If they are identical,
2114                   the epilogue is unnecessary.  */
2115                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2116                    || ((unsigned HOST_WIDE_INT) max_niter
2117                        > (th / const_vf) * const_vf))))
2118     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2119
2120   /* If an epilogue loop is required make sure we can create one.  */
2121   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2122       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2123     {
2124       if (dump_enabled_p ())
2125         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2126       if (!vect_can_advance_ivs_p (loop_vinfo)
2127           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2128                                            single_exit (LOOP_VINFO_LOOP
2129                                                          (loop_vinfo))))
2130         {
2131           if (dump_enabled_p ())
2132             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2133                              "not vectorized: can't create required "
2134                              "epilog loop\n");
2135           goto again;
2136         }
2137     }
2138
2139   /* During peeling, we need to check if number of loop iterations is
2140      enough for both peeled prolog loop and vector loop.  This check
2141      can be merged along with threshold check of loop versioning, so
2142      increase threshold for this case if necessary.  */
2143   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2144     {
2145       poly_uint64 niters_th = 0;
2146
2147       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2148         {
2149           /* Niters for peeled prolog loop.  */
2150           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2151             {
2152               struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2153               tree vectype
2154                 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2155               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2156             }
2157           else
2158             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2159         }
2160
2161       /* Niters for at least one iteration of vectorized loop.  */
2162       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2163         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2164       /* One additional iteration because of peeling for gap.  */
2165       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2166         niters_th += 1;
2167       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2168     }
2169
2170   gcc_assert (known_eq (vectorization_factor,
2171                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2172
2173   /* Ok to vectorize!  */
2174   return true;
2175
2176 again:
2177   /* Try again with SLP forced off but if we didn't do any SLP there is
2178      no point in re-trying.  */
2179   if (!slp)
2180     return false;
2181
2182   /* If there are reduction chains re-trying will fail anyway.  */
2183   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2184     return false;
2185
2186   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2187      via interleaving or lane instructions.  */
2188   slp_instance instance;
2189   slp_tree node;
2190   unsigned i, j;
2191   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2192     {
2193       stmt_vec_info vinfo;
2194       vinfo = vinfo_for_stmt
2195           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2196       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2197         continue;
2198       vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2199       unsigned int size = DR_GROUP_SIZE (vinfo);
2200       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2201       if (! vect_store_lanes_supported (vectype, size, false)
2202          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2203          && ! vect_grouped_store_supported (vectype, size))
2204        return false;
2205       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2206         {
2207           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2208           vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2209           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2210           size = DR_GROUP_SIZE (vinfo);
2211           vectype = STMT_VINFO_VECTYPE (vinfo);
2212           if (! vect_load_lanes_supported (vectype, size, false)
2213               && ! vect_grouped_load_supported (vectype, single_element_p,
2214                                                 size))
2215             return false;
2216         }
2217     }
2218
2219   if (dump_enabled_p ())
2220     dump_printf_loc (MSG_NOTE, vect_location,
2221                      "re-trying with SLP disabled\n");
2222
2223   /* Roll back state appropriately.  No SLP this time.  */
2224   slp = false;
2225   /* Restore vectorization factor as it were without SLP.  */
2226   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2227   /* Free the SLP instances.  */
2228   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2229     vect_free_slp_instance (instance);
2230   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2231   /* Reset SLP type to loop_vect on all stmts.  */
2232   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2233     {
2234       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2235       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2236            !gsi_end_p (si); gsi_next (&si))
2237         {
2238           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2239           STMT_SLP_TYPE (stmt_info) = loop_vect;
2240         }
2241       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2242            !gsi_end_p (si); gsi_next (&si))
2243         {
2244           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2245           STMT_SLP_TYPE (stmt_info) = loop_vect;
2246           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2247             {
2248               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2249               STMT_SLP_TYPE (stmt_info) = loop_vect;
2250               for (gimple_stmt_iterator pi
2251                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2252                    !gsi_end_p (pi); gsi_next (&pi))
2253                 {
2254                   gimple *pstmt = gsi_stmt (pi);
2255                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2256                 }
2257             }
2258         }
2259     }
2260   /* Free optimized alias test DDRS.  */
2261   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2262   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2263   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2264   /* Reset target cost data.  */
2265   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2266   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2267     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2268   /* Reset accumulated rgroup information.  */
2269   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2270   /* Reset assorted flags.  */
2271   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2272   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2273   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2274   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2275   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2276
2277   goto start_over;
2278 }
2279
2280 /* Function vect_analyze_loop.
2281
2282    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2283    for it.  The different analyses will record information in the
2284    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2285    be vectorized.  */
2286 loop_vec_info
2287 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2288 {
2289   loop_vec_info loop_vinfo;
2290   auto_vector_sizes vector_sizes;
2291
2292   /* Autodetect first vector size we try.  */
2293   current_vector_size = 0;
2294   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2295   unsigned int next_size = 0;
2296
2297   if (dump_enabled_p ())
2298     dump_printf_loc (MSG_NOTE, vect_location,
2299                      "===== analyze_loop_nest =====\n");
2300
2301   if (loop_outer (loop)
2302       && loop_vec_info_for_loop (loop_outer (loop))
2303       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2304     {
2305       if (dump_enabled_p ())
2306         dump_printf_loc (MSG_NOTE, vect_location,
2307                          "outer-loop already vectorized.\n");
2308       return NULL;
2309     }
2310
2311   poly_uint64 autodetected_vector_size = 0;
2312   while (1)
2313     {
2314       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2315       loop_vinfo = vect_analyze_loop_form (loop);
2316       if (!loop_vinfo)
2317         {
2318           if (dump_enabled_p ())
2319             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320                              "bad loop form.\n");
2321           return NULL;
2322         }
2323
2324       bool fatal = false;
2325
2326       if (orig_loop_vinfo)
2327         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2328
2329       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2330         {
2331           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2332
2333           return loop_vinfo;
2334         }
2335
2336       delete loop_vinfo;
2337
2338       if (next_size == 0)
2339         autodetected_vector_size = current_vector_size;
2340
2341       if (next_size < vector_sizes.length ()
2342           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2343         next_size += 1;
2344
2345       if (fatal
2346           || next_size == vector_sizes.length ()
2347           || known_eq (current_vector_size, 0U))
2348         return NULL;
2349
2350       /* Try the next biggest vector size.  */
2351       current_vector_size = vector_sizes[next_size++];
2352       if (dump_enabled_p ())
2353         {
2354           dump_printf_loc (MSG_NOTE, vect_location,
2355                            "***** Re-trying analysis with "
2356                            "vector size ");
2357           dump_dec (MSG_NOTE, current_vector_size);
2358           dump_printf (MSG_NOTE, "\n");
2359         }
2360     }
2361 }
2362
2363 /* Return true if there is an in-order reduction function for CODE, storing
2364    it in *REDUC_FN if so.  */
2365
2366 static bool
2367 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2368 {
2369   switch (code)
2370     {
2371     case PLUS_EXPR:
2372       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2373       return true;
2374
2375     default:
2376       return false;
2377     }
2378 }
2379
2380 /* Function reduction_fn_for_scalar_code
2381
2382    Input:
2383    CODE - tree_code of a reduction operations.
2384
2385    Output:
2386    REDUC_FN - the corresponding internal function to be used to reduce the
2387       vector of partial results into a single scalar result, or IFN_LAST
2388       if the operation is a supported reduction operation, but does not have
2389       such an internal function.
2390
2391    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2392
2393 static bool
2394 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2395 {
2396   switch (code)
2397     {
2398       case MAX_EXPR:
2399         *reduc_fn = IFN_REDUC_MAX;
2400         return true;
2401
2402       case MIN_EXPR:
2403         *reduc_fn = IFN_REDUC_MIN;
2404         return true;
2405
2406       case PLUS_EXPR:
2407         *reduc_fn = IFN_REDUC_PLUS;
2408         return true;
2409
2410       case BIT_AND_EXPR:
2411         *reduc_fn = IFN_REDUC_AND;
2412         return true;
2413
2414       case BIT_IOR_EXPR:
2415         *reduc_fn = IFN_REDUC_IOR;
2416         return true;
2417
2418       case BIT_XOR_EXPR:
2419         *reduc_fn = IFN_REDUC_XOR;
2420         return true;
2421
2422       case MULT_EXPR:
2423       case MINUS_EXPR:
2424         *reduc_fn = IFN_LAST;
2425         return true;
2426
2427       default:
2428        return false;
2429     }
2430 }
2431
2432 /* If there is a neutral value X such that SLP reduction NODE would not
2433    be affected by the introduction of additional X elements, return that X,
2434    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2435    is true if the SLP statements perform a single reduction, false if each
2436    statement performs an independent reduction.  */
2437
2438 static tree
2439 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2440                               bool reduc_chain)
2441 {
2442   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2443   gimple *stmt = stmts[0];
2444   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2445   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2446   tree scalar_type = TREE_TYPE (vector_type);
2447   struct loop *loop = gimple_bb (stmt)->loop_father;
2448   gcc_assert (loop);
2449
2450   switch (code)
2451     {
2452     case WIDEN_SUM_EXPR:
2453     case DOT_PROD_EXPR:
2454     case SAD_EXPR:
2455     case PLUS_EXPR:
2456     case MINUS_EXPR:
2457     case BIT_IOR_EXPR:
2458     case BIT_XOR_EXPR:
2459       return build_zero_cst (scalar_type);
2460
2461     case MULT_EXPR:
2462       return build_one_cst (scalar_type);
2463
2464     case BIT_AND_EXPR:
2465       return build_all_ones_cst (scalar_type);
2466
2467     case MAX_EXPR:
2468     case MIN_EXPR:
2469       /* For MIN/MAX the initial values are neutral.  A reduction chain
2470          has only a single initial value, so that value is neutral for
2471          all statements.  */
2472       if (reduc_chain)
2473         return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2474       return NULL_TREE;
2475
2476     default:
2477       return NULL_TREE;
2478     }
2479 }
2480
2481 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2482    STMT is printed with a message MSG. */
2483
2484 static void
2485 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2486 {
2487   dump_printf_loc (msg_type, vect_location, "%s", msg);
2488   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2489 }
2490
2491
2492 /* Detect SLP reduction of the form:
2493
2494    #a1 = phi <a5, a0>
2495    a2 = operation (a1)
2496    a3 = operation (a2)
2497    a4 = operation (a3)
2498    a5 = operation (a4)
2499
2500    #a = phi <a5>
2501
2502    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2503    FIRST_STMT is the first reduction stmt in the chain
2504    (a2 = operation (a1)).
2505
2506    Return TRUE if a reduction chain was detected.  */
2507
2508 static bool
2509 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2510                        gimple *first_stmt)
2511 {
2512   struct loop *loop = (gimple_bb (phi))->loop_father;
2513   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2514   enum tree_code code;
2515   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2516   stmt_vec_info use_stmt_info, current_stmt_info;
2517   tree lhs;
2518   imm_use_iterator imm_iter;
2519   use_operand_p use_p;
2520   int nloop_uses, size = 0, n_out_of_loop_uses;
2521   bool found = false;
2522
2523   if (loop != vect_loop)
2524     return false;
2525
2526   lhs = PHI_RESULT (phi);
2527   code = gimple_assign_rhs_code (first_stmt);
2528   while (1)
2529     {
2530       nloop_uses = 0;
2531       n_out_of_loop_uses = 0;
2532       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2533         {
2534           gimple *use_stmt = USE_STMT (use_p);
2535           if (is_gimple_debug (use_stmt))
2536             continue;
2537
2538           /* Check if we got back to the reduction phi.  */
2539           if (use_stmt == phi)
2540             {
2541               loop_use_stmt = use_stmt;
2542               found = true;
2543               break;
2544             }
2545
2546           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2547             {
2548               loop_use_stmt = use_stmt;
2549               nloop_uses++;
2550             }
2551            else
2552              n_out_of_loop_uses++;
2553
2554            /* There are can be either a single use in the loop or two uses in
2555               phi nodes.  */
2556            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2557              return false;
2558         }
2559
2560       if (found)
2561         break;
2562
2563       /* We reached a statement with no loop uses.  */
2564       if (nloop_uses == 0)
2565         return false;
2566
2567       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2568       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2569         return false;
2570
2571       if (!is_gimple_assign (loop_use_stmt)
2572           || code != gimple_assign_rhs_code (loop_use_stmt)
2573           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2574         return false;
2575
2576       /* Insert USE_STMT into reduction chain.  */
2577       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2578       if (current_stmt)
2579         {
2580           current_stmt_info = vinfo_for_stmt (current_stmt);
2581           REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2582           REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2583             = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2584         }
2585       else
2586         REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2587
2588       lhs = gimple_assign_lhs (loop_use_stmt);
2589       current_stmt = loop_use_stmt;
2590       size++;
2591    }
2592
2593   if (!found || loop_use_stmt != phi || size < 2)
2594     return false;
2595
2596   /* Swap the operands, if needed, to make the reduction operand be the second
2597      operand.  */
2598   lhs = PHI_RESULT (phi);
2599   next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2600   while (next_stmt)
2601     {
2602       if (gimple_assign_rhs2 (next_stmt) == lhs)
2603         {
2604           tree op = gimple_assign_rhs1 (next_stmt);
2605           gimple *def_stmt = NULL;
2606
2607           if (TREE_CODE (op) == SSA_NAME)
2608             def_stmt = SSA_NAME_DEF_STMT (op);
2609
2610           /* Check that the other def is either defined in the loop
2611              ("vect_internal_def"), or it's an induction (defined by a
2612              loop-header phi-node).  */
2613           if (def_stmt
2614               && gimple_bb (def_stmt)
2615               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2616               && (is_gimple_assign (def_stmt)
2617                   || is_gimple_call (def_stmt)
2618                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2619                            == vect_induction_def
2620                   || (gimple_code (def_stmt) == GIMPLE_PHI
2621                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2622                                   == vect_internal_def
2623                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2624             {
2625               lhs = gimple_assign_lhs (next_stmt);
2626               next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2627               continue;
2628             }
2629
2630           return false;
2631         }
2632       else
2633         {
2634           tree op = gimple_assign_rhs2 (next_stmt);
2635           gimple *def_stmt = NULL;
2636
2637           if (TREE_CODE (op) == SSA_NAME)
2638             def_stmt = SSA_NAME_DEF_STMT (op);
2639
2640           /* Check that the other def is either defined in the loop
2641             ("vect_internal_def"), or it's an induction (defined by a
2642             loop-header phi-node).  */
2643           if (def_stmt
2644               && gimple_bb (def_stmt)
2645               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2646               && (is_gimple_assign (def_stmt)
2647                   || is_gimple_call (def_stmt)
2648                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2649                               == vect_induction_def
2650                   || (gimple_code (def_stmt) == GIMPLE_PHI
2651                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2652                                   == vect_internal_def
2653                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2654             {
2655               if (dump_enabled_p ())
2656                 {
2657                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2658                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2659                 }
2660
2661               swap_ssa_operands (next_stmt,
2662                                  gimple_assign_rhs1_ptr (next_stmt),
2663                                  gimple_assign_rhs2_ptr (next_stmt));
2664               update_stmt (next_stmt);
2665
2666               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2667                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2668             }
2669           else
2670             return false;
2671         }
2672
2673       lhs = gimple_assign_lhs (next_stmt);
2674       next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2675     }
2676
2677   /* Save the chain for further analysis in SLP detection.  */
2678   first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2679   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2680   REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2681
2682   return true;
2683 }
2684
2685 /* Return true if we need an in-order reduction for operation CODE
2686    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2687    overflow must wrap.  */
2688
2689 static bool
2690 needs_fold_left_reduction_p (tree type, tree_code code,
2691                              bool need_wrapping_integral_overflow)
2692 {
2693   /* CHECKME: check for !flag_finite_math_only too?  */
2694   if (SCALAR_FLOAT_TYPE_P (type))
2695     switch (code)
2696       {
2697       case MIN_EXPR:
2698       case MAX_EXPR:
2699         return false;
2700
2701       default:
2702         return !flag_associative_math;
2703       }
2704
2705   if (INTEGRAL_TYPE_P (type))
2706     {
2707       if (!operation_no_trapping_overflow (type, code))
2708         return true;
2709       if (need_wrapping_integral_overflow
2710           && !TYPE_OVERFLOW_WRAPS (type)
2711           && operation_can_overflow (code))
2712         return true;
2713       return false;
2714     }
2715
2716   if (SAT_FIXED_POINT_TYPE_P (type))
2717     return true;
2718
2719   return false;
2720 }
2721
2722 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2723    reduction operation CODE has a handled computation expression.  */
2724
2725 bool
2726 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2727                       enum tree_code code)
2728 {
2729   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2730   auto_bitmap visited;
2731   tree lookfor = PHI_RESULT (phi);
2732   ssa_op_iter curri;
2733   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2734   while (USE_FROM_PTR (curr) != loop_arg)
2735     curr = op_iter_next_use (&curri);
2736   curri.i = curri.numops;
2737   do
2738     {
2739       path.safe_push (std::make_pair (curri, curr));
2740       tree use = USE_FROM_PTR (curr);
2741       if (use == lookfor)
2742         break;
2743       gimple *def = SSA_NAME_DEF_STMT (use);
2744       if (gimple_nop_p (def)
2745           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2746         {
2747 pop:
2748           do
2749             {
2750               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2751               curri = x.first;
2752               curr = x.second;
2753               do
2754                 curr = op_iter_next_use (&curri);
2755               /* Skip already visited or non-SSA operands (from iterating
2756                  over PHI args).  */
2757               while (curr != NULL_USE_OPERAND_P
2758                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2759                          || ! bitmap_set_bit (visited,
2760                                               SSA_NAME_VERSION
2761                                                 (USE_FROM_PTR (curr)))));
2762             }
2763           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2764           if (curr == NULL_USE_OPERAND_P)
2765             break;
2766         }
2767       else
2768         {
2769           if (gimple_code (def) == GIMPLE_PHI)
2770             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2771           else
2772             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2773           while (curr != NULL_USE_OPERAND_P
2774                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2775                      || ! bitmap_set_bit (visited,
2776                                           SSA_NAME_VERSION
2777                                             (USE_FROM_PTR (curr)))))
2778             curr = op_iter_next_use (&curri);
2779           if (curr == NULL_USE_OPERAND_P)
2780             goto pop;
2781         }
2782     }
2783   while (1);
2784   if (dump_file && (dump_flags & TDF_DETAILS))
2785     {
2786       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2787       unsigned i;
2788       std::pair<ssa_op_iter, use_operand_p> *x;
2789       FOR_EACH_VEC_ELT (path, i, x)
2790         {
2791           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2792           dump_printf (MSG_NOTE, " ");
2793         }
2794       dump_printf (MSG_NOTE, "\n");
2795     }
2796
2797   /* Check whether the reduction path detected is valid.  */
2798   bool fail = path.length () == 0;
2799   bool neg = false;
2800   for (unsigned i = 1; i < path.length (); ++i)
2801     {
2802       gimple *use_stmt = USE_STMT (path[i].second);
2803       tree op = USE_FROM_PTR (path[i].second);
2804       if (! has_single_use (op)
2805           || ! is_gimple_assign (use_stmt))
2806         {
2807           fail = true;
2808           break;
2809         }
2810       if (gimple_assign_rhs_code (use_stmt) != code)
2811         {
2812           if (code == PLUS_EXPR
2813               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2814             {
2815               /* Track whether we negate the reduction value each iteration.  */
2816               if (gimple_assign_rhs2 (use_stmt) == op)
2817                 neg = ! neg;
2818             }
2819           else
2820             {
2821               fail = true;
2822               break;
2823             }
2824         }
2825     }
2826   return ! fail && ! neg;
2827 }
2828
2829
2830 /* Function vect_is_simple_reduction
2831
2832    (1) Detect a cross-iteration def-use cycle that represents a simple
2833    reduction computation.  We look for the following pattern:
2834
2835    loop_header:
2836      a1 = phi < a0, a2 >
2837      a3 = ...
2838      a2 = operation (a3, a1)
2839
2840    or
2841
2842    a3 = ...
2843    loop_header:
2844      a1 = phi < a0, a2 >
2845      a2 = operation (a3, a1)
2846
2847    such that:
2848    1. operation is commutative and associative and it is safe to
2849       change the order of the computation
2850    2. no uses for a2 in the loop (a2 is used out of the loop)
2851    3. no uses of a1 in the loop besides the reduction operation
2852    4. no uses of a1 outside the loop.
2853
2854    Conditions 1,4 are tested here.
2855    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2856
2857    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2858    nested cycles.
2859
2860    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2861    reductions:
2862
2863      a1 = phi < a0, a2 >
2864      inner loop (def of a3)
2865      a2 = phi < a3 >
2866
2867    (4) Detect condition expressions, ie:
2868      for (int i = 0; i < N; i++)
2869        if (a[i] < val)
2870         ret_val = a[i];
2871
2872 */
2873
2874 static gimple *
2875 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2876                           bool *double_reduc,
2877                           bool need_wrapping_integral_overflow,
2878                           enum vect_reduction_type *v_reduc_type)
2879 {
2880   struct loop *loop = (gimple_bb (phi))->loop_father;
2881   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2882   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2883   enum tree_code orig_code, code;
2884   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2885   tree type;
2886   int nloop_uses;
2887   tree name;
2888   imm_use_iterator imm_iter;
2889   use_operand_p use_p;
2890   bool phi_def;
2891
2892   *double_reduc = false;
2893   *v_reduc_type = TREE_CODE_REDUCTION;
2894
2895   tree phi_name = PHI_RESULT (phi);
2896   /* ???  If there are no uses of the PHI result the inner loop reduction
2897      won't be detected as possibly double-reduction by vectorizable_reduction
2898      because that tries to walk the PHI arg from the preheader edge which
2899      can be constant.  See PR60382.  */
2900   if (has_zero_uses (phi_name))
2901     return NULL;
2902   nloop_uses = 0;
2903   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2904     {
2905       gimple *use_stmt = USE_STMT (use_p);
2906       if (is_gimple_debug (use_stmt))
2907         continue;
2908
2909       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2910         {
2911           if (dump_enabled_p ())
2912             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2913                              "intermediate value used outside loop.\n");
2914
2915           return NULL;
2916         }
2917
2918       nloop_uses++;
2919       if (nloop_uses > 1)
2920         {
2921           if (dump_enabled_p ())
2922             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2923                              "reduction value used in loop.\n");
2924           return NULL;
2925         }
2926
2927       phi_use_stmt = use_stmt;
2928     }
2929
2930   edge latch_e = loop_latch_edge (loop);
2931   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2932   if (TREE_CODE (loop_arg) != SSA_NAME)
2933     {
2934       if (dump_enabled_p ())
2935         {
2936           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2937                            "reduction: not ssa_name: ");
2938           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2939           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2940         }
2941       return NULL;
2942     }
2943
2944   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2945   if (is_gimple_assign (def_stmt))
2946     {
2947       name = gimple_assign_lhs (def_stmt);
2948       phi_def = false;
2949     }
2950   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2951     {
2952       name = PHI_RESULT (def_stmt);
2953       phi_def = true;
2954     }
2955   else
2956     {
2957       if (dump_enabled_p ())
2958         {
2959           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2960                            "reduction: unhandled reduction operation: ");
2961           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2962         }
2963       return NULL;
2964     }
2965
2966   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2967     return NULL;
2968
2969   nloop_uses = 0;
2970   auto_vec<gphi *, 3> lcphis;
2971   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2972     {
2973       gimple *use_stmt = USE_STMT (use_p);
2974       if (is_gimple_debug (use_stmt))
2975         continue;
2976       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2977         nloop_uses++;
2978       else
2979         /* We can have more than one loop-closed PHI.  */
2980         lcphis.safe_push (as_a <gphi *> (use_stmt));
2981       if (nloop_uses > 1)
2982         {
2983           if (dump_enabled_p ())
2984             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2985                              "reduction used in loop.\n");
2986           return NULL;
2987         }
2988     }
2989
2990   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2991      defined in the inner loop.  */
2992   if (phi_def)
2993     {
2994       op1 = PHI_ARG_DEF (def_stmt, 0);
2995
2996       if (gimple_phi_num_args (def_stmt) != 1
2997           || TREE_CODE (op1) != SSA_NAME)
2998         {
2999           if (dump_enabled_p ())
3000             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001                              "unsupported phi node definition.\n");
3002
3003           return NULL;
3004         }
3005
3006       def1 = SSA_NAME_DEF_STMT (op1);
3007       if (gimple_bb (def1)
3008           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3009           && loop->inner
3010           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3011           && is_gimple_assign (def1)
3012           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3013         {
3014           if (dump_enabled_p ())
3015             report_vect_op (MSG_NOTE, def_stmt,
3016                             "detected double reduction: ");
3017
3018           *double_reduc = true;
3019           return def_stmt;
3020         }
3021
3022       return NULL;
3023     }
3024
3025   /* If we are vectorizing an inner reduction we are executing that
3026      in the original order only in case we are not dealing with a
3027      double reduction.  */
3028   bool check_reduction = true;
3029   if (flow_loop_nested_p (vect_loop, loop))
3030     {
3031       gphi *lcphi;
3032       unsigned i;
3033       check_reduction = false;
3034       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3035         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3036           {
3037             gimple *use_stmt = USE_STMT (use_p);
3038             if (is_gimple_debug (use_stmt))
3039               continue;
3040             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3041               check_reduction = true;
3042           }
3043     }
3044
3045   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3046   code = orig_code = gimple_assign_rhs_code (def_stmt);
3047
3048   /* We can handle "res -= x[i]", which is non-associative by
3049      simply rewriting this into "res += -x[i]".  Avoid changing
3050      gimple instruction for the first simple tests and only do this
3051      if we're allowed to change code at all.  */
3052   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3053     code = PLUS_EXPR;
3054
3055   if (code == COND_EXPR)
3056     {
3057       if (! nested_in_vect_loop)
3058         *v_reduc_type = COND_REDUCTION;
3059
3060       op3 = gimple_assign_rhs1 (def_stmt);
3061       if (COMPARISON_CLASS_P (op3))
3062         {
3063           op4 = TREE_OPERAND (op3, 1);
3064           op3 = TREE_OPERAND (op3, 0);
3065         }
3066       if (op3 == phi_name || op4 == phi_name)
3067         {
3068           if (dump_enabled_p ())
3069             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3070                             "reduction: condition depends on previous"
3071                             " iteration: ");
3072           return NULL;
3073         }
3074
3075       op1 = gimple_assign_rhs2 (def_stmt);
3076       op2 = gimple_assign_rhs3 (def_stmt);
3077     }
3078   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3079     {
3080       if (dump_enabled_p ())
3081         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3082                         "reduction: not commutative/associative: ");
3083       return NULL;
3084     }
3085   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3086     {
3087       op1 = gimple_assign_rhs1 (def_stmt);
3088       op2 = gimple_assign_rhs2 (def_stmt);
3089     }
3090   else
3091     {
3092       if (dump_enabled_p ())
3093         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3094                         "reduction: not handled operation: ");
3095       return NULL;
3096     }
3097
3098   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3099     {
3100       if (dump_enabled_p ())
3101         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3102                         "reduction: both uses not ssa_names: ");
3103
3104       return NULL;
3105     }
3106
3107   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3108   if ((TREE_CODE (op1) == SSA_NAME
3109        && !types_compatible_p (type,TREE_TYPE (op1)))
3110       || (TREE_CODE (op2) == SSA_NAME
3111           && !types_compatible_p (type, TREE_TYPE (op2)))
3112       || (op3 && TREE_CODE (op3) == SSA_NAME
3113           && !types_compatible_p (type, TREE_TYPE (op3)))
3114       || (op4 && TREE_CODE (op4) == SSA_NAME
3115           && !types_compatible_p (type, TREE_TYPE (op4))))
3116     {
3117       if (dump_enabled_p ())
3118         {
3119           dump_printf_loc (MSG_NOTE, vect_location,
3120                            "reduction: multiple types: operation type: ");
3121           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3122           dump_printf (MSG_NOTE, ", operands types: ");
3123           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3124                              TREE_TYPE (op1));
3125           dump_printf (MSG_NOTE, ",");
3126           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3127                              TREE_TYPE (op2));
3128           if (op3)
3129             {
3130               dump_printf (MSG_NOTE, ",");
3131               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3132                                  TREE_TYPE (op3));
3133             }
3134
3135           if (op4)
3136             {
3137               dump_printf (MSG_NOTE, ",");
3138               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3139                                  TREE_TYPE (op4));
3140             }
3141           dump_printf (MSG_NOTE, "\n");
3142         }
3143
3144       return NULL;
3145     }
3146
3147   /* Check whether it's ok to change the order of the computation.
3148      Generally, when vectorizing a reduction we change the order of the
3149      computation.  This may change the behavior of the program in some
3150      cases, so we need to check that this is ok.  One exception is when
3151      vectorizing an outer-loop: the inner-loop is executed sequentially,
3152      and therefore vectorizing reductions in the inner-loop during
3153      outer-loop vectorization is safe.  */
3154   if (check_reduction
3155       && *v_reduc_type == TREE_CODE_REDUCTION
3156       && needs_fold_left_reduction_p (type, code,
3157                                       need_wrapping_integral_overflow))
3158     *v_reduc_type = FOLD_LEFT_REDUCTION;
3159
3160   /* Reduction is safe. We're dealing with one of the following:
3161      1) integer arithmetic and no trapv
3162      2) floating point arithmetic, and special flags permit this optimization
3163      3) nested cycle (i.e., outer loop vectorization).  */
3164   if (TREE_CODE (op1) == SSA_NAME)
3165     def1 = SSA_NAME_DEF_STMT (op1);
3166
3167   if (TREE_CODE (op2) == SSA_NAME)
3168     def2 = SSA_NAME_DEF_STMT (op2);
3169
3170   if (code != COND_EXPR
3171       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3172     {
3173       if (dump_enabled_p ())
3174         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3175       return NULL;
3176     }
3177
3178   /* Check that one def is the reduction def, defined by PHI,
3179      the other def is either defined in the loop ("vect_internal_def"),
3180      or it's an induction (defined by a loop-header phi-node).  */
3181
3182   if (def2 && def2 == phi
3183       && (code == COND_EXPR
3184           || !def1 || gimple_nop_p (def1)
3185           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3186           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3187               && (is_gimple_assign (def1)
3188                   || is_gimple_call (def1)
3189                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3190                       == vect_induction_def
3191                   || (gimple_code (def1) == GIMPLE_PHI
3192                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3193                           == vect_internal_def
3194                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3195     {
3196       if (dump_enabled_p ())
3197         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3198       return def_stmt;
3199     }
3200
3201   if (def1 && def1 == phi
3202       && (code == COND_EXPR
3203           || !def2 || gimple_nop_p (def2)
3204           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3205           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3206               && (is_gimple_assign (def2)
3207                   || is_gimple_call (def2)
3208                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3209                        == vect_induction_def
3210                   || (gimple_code (def2) == GIMPLE_PHI
3211                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3212                            == vect_internal_def
3213                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3214     {
3215       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3216         {
3217           /* Check if we can swap operands (just for simplicity - so that
3218              the rest of the code can assume that the reduction variable
3219              is always the last (second) argument).  */
3220           if (code == COND_EXPR)
3221             {
3222               /* Swap cond_expr by inverting the condition.  */
3223               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3224               enum tree_code invert_code = ERROR_MARK;
3225               enum tree_code cond_code = TREE_CODE (cond_expr);
3226
3227               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3228                 {
3229                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3230                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3231                 }
3232               if (invert_code != ERROR_MARK)
3233                 {
3234                   TREE_SET_CODE (cond_expr, invert_code);
3235                   swap_ssa_operands (def_stmt,
3236                                      gimple_assign_rhs2_ptr (def_stmt),
3237                                      gimple_assign_rhs3_ptr (def_stmt));
3238                 }
3239               else
3240                 {
3241                   if (dump_enabled_p ())
3242                     report_vect_op (MSG_NOTE, def_stmt,
3243                                     "detected reduction: cannot swap operands "
3244                                     "for cond_expr");
3245                   return NULL;
3246                 }
3247             }
3248           else
3249             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3250                                gimple_assign_rhs2_ptr (def_stmt));
3251
3252           if (dump_enabled_p ())
3253             report_vect_op (MSG_NOTE, def_stmt,
3254                             "detected reduction: need to swap operands: ");
3255
3256           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3257             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3258         }
3259       else
3260         {
3261           if (dump_enabled_p ())
3262             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3263         }
3264
3265       return def_stmt;
3266     }
3267
3268   /* Try to find SLP reduction chain.  */
3269   if (! nested_in_vect_loop
3270       && code != COND_EXPR
3271       && orig_code != MINUS_EXPR
3272       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3273     {
3274       if (dump_enabled_p ())
3275         report_vect_op (MSG_NOTE, def_stmt,
3276                         "reduction: detected reduction chain: ");
3277
3278       return def_stmt;
3279     }
3280
3281   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3282   gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3283   while (first)
3284     {
3285       gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3286       REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3287       REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3288       first = next;
3289     }
3290
3291   /* Look for the expression computing loop_arg from loop PHI result.  */
3292   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3293                             code))
3294     return def_stmt;
3295
3296   if (dump_enabled_p ())
3297     {
3298       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3299                       "reduction: unknown pattern: ");
3300     }
3301
3302   return NULL;
3303 }
3304
3305 /* Wrapper around vect_is_simple_reduction, which will modify code
3306    in-place if it enables detection of more reductions.  Arguments
3307    as there.  */
3308
3309 gimple *
3310 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3311                              bool *double_reduc,
3312                              bool need_wrapping_integral_overflow)
3313 {
3314   enum vect_reduction_type v_reduc_type;
3315   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3316                                           need_wrapping_integral_overflow,
3317                                           &v_reduc_type);
3318   if (def)
3319     {
3320       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3321       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3322       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3323       reduc_def_info = vinfo_for_stmt (def);
3324       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3325       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3326     }
3327   return def;
3328 }
3329
3330 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3331 int
3332 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3333                              int *peel_iters_epilogue,
3334                              stmt_vector_for_cost *scalar_cost_vec,
3335                              stmt_vector_for_cost *prologue_cost_vec,
3336                              stmt_vector_for_cost *epilogue_cost_vec)
3337 {
3338   int retval = 0;
3339   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3340
3341   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3342     {
3343       *peel_iters_epilogue = assumed_vf / 2;
3344       if (dump_enabled_p ())
3345         dump_printf_loc (MSG_NOTE, vect_location,
3346                          "cost model: epilogue peel iters set to vf/2 "
3347                          "because loop iterations are unknown .\n");
3348
3349       /* If peeled iterations are known but number of scalar loop
3350          iterations are unknown, count a taken branch per peeled loop.  */
3351       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3352                                  NULL, 0, vect_prologue);
3353       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3354                                  NULL, 0, vect_epilogue);
3355     }
3356   else
3357     {
3358       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3359       peel_iters_prologue = niters < peel_iters_prologue ?
3360                             niters : peel_iters_prologue;
3361       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3362       /* If we need to peel for gaps, but no peeling is required, we have to
3363          peel VF iterations.  */
3364       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3365         *peel_iters_epilogue = assumed_vf;
3366     }
3367
3368   stmt_info_for_cost *si;
3369   int j;
3370   if (peel_iters_prologue)
3371     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3372         {
3373           stmt_vec_info stmt_info
3374             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3375           retval += record_stmt_cost (prologue_cost_vec,
3376                                       si->count * peel_iters_prologue,
3377                                       si->kind, stmt_info, si->misalign,
3378                                       vect_prologue);
3379         }
3380   if (*peel_iters_epilogue)
3381     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3382         {
3383           stmt_vec_info stmt_info
3384             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3385           retval += record_stmt_cost (epilogue_cost_vec,
3386                                       si->count * *peel_iters_epilogue,
3387                                       si->kind, stmt_info, si->misalign,
3388                                       vect_epilogue);
3389         }
3390
3391   return retval;
3392 }
3393
3394 /* Function vect_estimate_min_profitable_iters
3395
3396    Return the number of iterations required for the vector version of the
3397    loop to be profitable relative to the cost of the scalar version of the
3398    loop.
3399
3400    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3401    of iterations for vectorization.  -1 value means loop vectorization
3402    is not profitable.  This returned value may be used for dynamic
3403    profitability check.
3404
3405    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3406    for static check against estimated number of iterations.  */
3407
3408 static void
3409 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3410                                     int *ret_min_profitable_niters,
3411                                     int *ret_min_profitable_estimate)
3412 {
3413   int min_profitable_iters;
3414   int min_profitable_estimate;
3415   int peel_iters_prologue;
3416   int peel_iters_epilogue;
3417   unsigned vec_inside_cost = 0;
3418   int vec_outside_cost = 0;
3419   unsigned vec_prologue_cost = 0;
3420   unsigned vec_epilogue_cost = 0;
3421   int scalar_single_iter_cost = 0;
3422   int scalar_outside_cost = 0;
3423   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3424   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3425   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3426
3427   /* Cost model disabled.  */
3428   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3429     {
3430       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3431       *ret_min_profitable_niters = 0;
3432       *ret_min_profitable_estimate = 0;
3433       return;
3434     }
3435
3436   /* Requires loop versioning tests to handle misalignment.  */
3437   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3438     {
3439       /*  FIXME: Make cost depend on complexity of individual check.  */
3440       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3441       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3442                             vect_prologue);
3443       dump_printf (MSG_NOTE,
3444                    "cost model: Adding cost of checks for loop "
3445                    "versioning to treat misalignment.\n");
3446     }
3447
3448   /* Requires loop versioning with alias checks.  */
3449   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3450     {
3451       /*  FIXME: Make cost depend on complexity of individual check.  */
3452       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3453       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3454                             vect_prologue);
3455       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3456       if (len)
3457         /* Count LEN - 1 ANDs and LEN comparisons.  */
3458         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3459                               NULL, 0, vect_prologue);
3460       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3461       if (len)
3462         {
3463           /* Count LEN - 1 ANDs and LEN comparisons.  */
3464           unsigned int nstmts = len * 2 - 1;
3465           /* +1 for each bias that needs adding.  */
3466           for (unsigned int i = 0; i < len; ++i)
3467             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3468               nstmts += 1;
3469           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3470                                 NULL, 0, vect_prologue);
3471         }
3472       dump_printf (MSG_NOTE,
3473                    "cost model: Adding cost of checks for loop "
3474                    "versioning aliasing.\n");
3475     }
3476
3477   /* Requires loop versioning with niter checks.  */
3478   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3479     {
3480       /*  FIXME: Make cost depend on complexity of individual check.  */
3481       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3482                             vect_prologue);
3483       dump_printf (MSG_NOTE,
3484                    "cost model: Adding cost of checks for loop "
3485                    "versioning niters.\n");
3486     }
3487
3488   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3489     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3490                           vect_prologue);
3491
3492   /* Count statements in scalar loop.  Using this as scalar cost for a single
3493      iteration for now.
3494
3495      TODO: Add outer loop support.
3496
3497      TODO: Consider assigning different costs to different scalar
3498      statements.  */
3499
3500   scalar_single_iter_cost
3501     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3502
3503   /* Add additional cost for the peeled instructions in prologue and epilogue
3504      loop.  (For fully-masked loops there will be no peeling.)
3505
3506      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3507      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3508
3509      TODO: Build an expression that represents peel_iters for prologue and
3510      epilogue to be used in a run-time test.  */
3511
3512   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3513     {
3514       peel_iters_prologue = 0;
3515       peel_iters_epilogue = 0;
3516
3517       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3518         {
3519           /* We need to peel exactly one iteration.  */
3520           peel_iters_epilogue += 1;
3521           stmt_info_for_cost *si;
3522           int j;
3523           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3524                             j, si)
3525             {
3526               struct _stmt_vec_info *stmt_info
3527                 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3528               (void) add_stmt_cost (target_cost_data, si->count,
3529                                     si->kind, stmt_info, si->misalign,
3530                                     vect_epilogue);
3531             }
3532         }
3533     }
3534   else if (npeel < 0)
3535     {
3536       peel_iters_prologue = assumed_vf / 2;
3537       dump_printf (MSG_NOTE, "cost model: "
3538                    "prologue peel iters set to vf/2.\n");
3539
3540       /* If peeling for alignment is unknown, loop bound of main loop becomes
3541          unknown.  */
3542       peel_iters_epilogue = assumed_vf / 2;
3543       dump_printf (MSG_NOTE, "cost model: "
3544                    "epilogue peel iters set to vf/2 because "
3545                    "peeling for alignment is unknown.\n");
3546
3547       /* If peeled iterations are unknown, count a taken branch and a not taken
3548          branch per peeled loop. Even if scalar loop iterations are known,
3549          vector iterations are not known since peeled prologue iterations are
3550          not known. Hence guards remain the same.  */
3551       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3552                             NULL, 0, vect_prologue);
3553       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3554                             NULL, 0, vect_prologue);
3555       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3556                             NULL, 0, vect_epilogue);
3557       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3558                             NULL, 0, vect_epilogue);
3559       stmt_info_for_cost *si;
3560       int j;
3561       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3562         {
3563           struct _stmt_vec_info *stmt_info
3564             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3565           (void) add_stmt_cost (target_cost_data,
3566                                 si->count * peel_iters_prologue,
3567                                 si->kind, stmt_info, si->misalign,
3568                                 vect_prologue);
3569           (void) add_stmt_cost (target_cost_data,
3570                                 si->count * peel_iters_epilogue,
3571                                 si->kind, stmt_info, si->misalign,
3572                                 vect_epilogue);
3573         }
3574     }
3575   else
3576     {
3577       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3578       stmt_info_for_cost *si;
3579       int j;
3580       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3581
3582       prologue_cost_vec.create (2);
3583       epilogue_cost_vec.create (2);
3584       peel_iters_prologue = npeel;
3585
3586       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3587                                           &peel_iters_epilogue,
3588                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3589                                             (loop_vinfo),
3590                                           &prologue_cost_vec,
3591                                           &epilogue_cost_vec);
3592
3593       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3594         {
3595           struct _stmt_vec_info *stmt_info
3596             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3597           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3598                                 si->misalign, vect_prologue);
3599         }
3600
3601       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3602         {
3603           struct _stmt_vec_info *stmt_info
3604             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3605           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3606                                 si->misalign, vect_epilogue);
3607         }
3608
3609       prologue_cost_vec.release ();
3610       epilogue_cost_vec.release ();
3611     }
3612
3613   /* FORNOW: The scalar outside cost is incremented in one of the
3614      following ways:
3615
3616      1. The vectorizer checks for alignment and aliasing and generates
3617      a condition that allows dynamic vectorization.  A cost model
3618      check is ANDED with the versioning condition.  Hence scalar code
3619      path now has the added cost of the versioning check.
3620
3621        if (cost > th & versioning_check)
3622          jmp to vector code
3623
3624      Hence run-time scalar is incremented by not-taken branch cost.
3625
3626      2. The vectorizer then checks if a prologue is required.  If the
3627      cost model check was not done before during versioning, it has to
3628      be done before the prologue check.
3629
3630        if (cost <= th)
3631          prologue = scalar_iters
3632        if (prologue == 0)
3633          jmp to vector code
3634        else
3635          execute prologue
3636        if (prologue == num_iters)
3637          go to exit
3638
3639      Hence the run-time scalar cost is incremented by a taken branch,
3640      plus a not-taken branch, plus a taken branch cost.
3641
3642      3. The vectorizer then checks if an epilogue is required.  If the
3643      cost model check was not done before during prologue check, it
3644      has to be done with the epilogue check.
3645
3646        if (prologue == 0)
3647          jmp to vector code
3648        else
3649          execute prologue
3650        if (prologue == num_iters)
3651          go to exit
3652        vector code:
3653          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3654            jmp to epilogue
3655
3656      Hence the run-time scalar cost should be incremented by 2 taken
3657      branches.
3658
3659      TODO: The back end may reorder the BBS's differently and reverse
3660      conditions/branch directions.  Change the estimates below to
3661      something more reasonable.  */
3662
3663   /* If the number of iterations is known and we do not do versioning, we can
3664      decide whether to vectorize at compile time.  Hence the scalar version
3665      do not carry cost model guard costs.  */
3666   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3667       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3668     {
3669       /* Cost model check occurs at versioning.  */
3670       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3671         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3672       else
3673         {
3674           /* Cost model check occurs at prologue generation.  */
3675           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3676             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3677               + vect_get_stmt_cost (cond_branch_not_taken);
3678           /* Cost model check occurs at epilogue generation.  */
3679           else
3680             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3681         }
3682     }
3683
3684   /* Complete the target-specific cost calculations.  */
3685   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3686                &vec_inside_cost, &vec_epilogue_cost);
3687
3688   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3689
3690   if (dump_enabled_p ())
3691     {
3692       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3693       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3694                    vec_inside_cost);
3695       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3696                    vec_prologue_cost);
3697       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3698                    vec_epilogue_cost);
3699       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3700                    scalar_single_iter_cost);
3701       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3702                    scalar_outside_cost);
3703       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3704                    vec_outside_cost);
3705       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3706                    peel_iters_prologue);
3707       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3708                    peel_iters_epilogue);
3709     }
3710
3711   /* Calculate number of iterations required to make the vector version
3712      profitable, relative to the loop bodies only.  The following condition
3713      must hold true:
3714      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3715      where
3716      SIC = scalar iteration cost, VIC = vector iteration cost,
3717      VOC = vector outside cost, VF = vectorization factor,
3718      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3719      SOC = scalar outside cost for run time cost model check.  */
3720
3721   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3722     {
3723       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3724                               * assumed_vf
3725                               - vec_inside_cost * peel_iters_prologue
3726                               - vec_inside_cost * peel_iters_epilogue);
3727       if (min_profitable_iters <= 0)
3728         min_profitable_iters = 0;
3729       else
3730         {
3731           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3732                                    - vec_inside_cost);
3733
3734           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3735               <= (((int) vec_inside_cost * min_profitable_iters)
3736                   + (((int) vec_outside_cost - scalar_outside_cost)
3737                      * assumed_vf)))
3738             min_profitable_iters++;
3739         }
3740     }
3741   /* vector version will never be profitable.  */
3742   else
3743     {
3744       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3745         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3746                     "did not happen for a simd loop");
3747
3748       if (dump_enabled_p ())
3749         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3750                          "cost model: the vector iteration cost = %d "
3751                          "divided by the scalar iteration cost = %d "
3752                          "is greater or equal to the vectorization factor = %d"
3753                          ".\n",
3754                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3755       *ret_min_profitable_niters = -1;
3756       *ret_min_profitable_estimate = -1;
3757       return;
3758     }
3759
3760   dump_printf (MSG_NOTE,
3761                "  Calculated minimum iters for profitability: %d\n",
3762                min_profitable_iters);
3763
3764   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3765       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3766     /* We want the vectorized loop to execute at least once.  */
3767     min_profitable_iters = assumed_vf + peel_iters_prologue;
3768
3769   if (dump_enabled_p ())
3770     dump_printf_loc (MSG_NOTE, vect_location,
3771                      "  Runtime profitability threshold = %d\n",
3772                      min_profitable_iters);
3773
3774   *ret_min_profitable_niters = min_profitable_iters;
3775
3776   /* Calculate number of iterations required to make the vector version
3777      profitable, relative to the loop bodies only.
3778
3779      Non-vectorized variant is SIC * niters and it must win over vector
3780      variant on the expected loop trip count.  The following condition must hold true:
3781      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3782
3783   if (vec_outside_cost <= 0)
3784     min_profitable_estimate = 0;
3785   else
3786     {
3787       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3788                                  * assumed_vf
3789                                  - vec_inside_cost * peel_iters_prologue
3790                                  - vec_inside_cost * peel_iters_epilogue)
3791                                  / ((scalar_single_iter_cost * assumed_vf)
3792                                    - vec_inside_cost);
3793     }
3794   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3795   if (dump_enabled_p ())
3796     dump_printf_loc (MSG_NOTE, vect_location,
3797                      "  Static estimate profitability threshold = %d\n",
3798                      min_profitable_estimate);
3799
3800   *ret_min_profitable_estimate = min_profitable_estimate;
3801 }
3802
3803 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3804    vector elements (not bits) for a vector with NELT elements.  */
3805 static void
3806 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3807                               vec_perm_builder *sel)
3808 {
3809   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3810      by vec_perm_indices.  */
3811   sel->new_vector (nelt, 1, 3);
3812   for (unsigned int i = 0; i < 3; i++)
3813     sel->quick_push (i + offset);
3814 }
3815
3816 /* Checks whether the target supports whole-vector shifts for vectors of mode
3817    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3818    it supports vec_perm_const with masks for all necessary shift amounts.  */
3819 static bool
3820 have_whole_vector_shift (machine_mode mode)
3821 {
3822   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3823     return true;
3824
3825   /* Variable-length vectors should be handled via the optab.  */
3826   unsigned int nelt;
3827   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3828     return false;
3829
3830   vec_perm_builder sel;
3831   vec_perm_indices indices;
3832   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3833     {
3834       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3835       indices.new_vector (sel, 2, nelt);
3836       if (!can_vec_perm_const_p (mode, indices, false))
3837         return false;
3838     }
3839   return true;
3840 }
3841
3842 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3843    functions. Design better to avoid maintenance issues.  */
3844
3845 /* Function vect_model_reduction_cost.
3846
3847    Models cost for a reduction operation, including the vector ops
3848    generated within the strip-mine loop, the initial definition before
3849    the loop, and the epilogue code that must be generated.  */
3850
3851 static void
3852 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3853                            int ncopies, stmt_vector_for_cost *cost_vec)
3854 {
3855   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3856   enum tree_code code;
3857   optab optab;
3858   tree vectype;
3859   gimple *orig_stmt;
3860   machine_mode mode;
3861   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3862   struct loop *loop = NULL;
3863
3864   if (loop_vinfo)
3865     loop = LOOP_VINFO_LOOP (loop_vinfo);
3866
3867   /* Condition reductions generate two reductions in the loop.  */
3868   vect_reduction_type reduction_type
3869     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3870   if (reduction_type == COND_REDUCTION)
3871     ncopies *= 2;
3872
3873   vectype = STMT_VINFO_VECTYPE (stmt_info);
3874   mode = TYPE_MODE (vectype);
3875   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3876
3877   if (!orig_stmt)
3878     orig_stmt = STMT_VINFO_STMT (stmt_info);
3879
3880   code = gimple_assign_rhs_code (orig_stmt);
3881
3882   if (reduction_type == EXTRACT_LAST_REDUCTION
3883       || reduction_type == FOLD_LEFT_REDUCTION)
3884     {
3885       /* No extra instructions needed in the prologue.  */
3886       prologue_cost = 0;
3887
3888       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3889         /* Count one reduction-like operation per vector.  */
3890         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3891                                         stmt_info, 0, vect_body);
3892       else
3893         {
3894           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
3895           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3896           inside_cost = record_stmt_cost (cost_vec, nelements,
3897                                           vec_to_scalar, stmt_info, 0,
3898                                           vect_body);
3899           inside_cost += record_stmt_cost (cost_vec, nelements,
3900                                            scalar_stmt, stmt_info, 0,
3901                                            vect_body);
3902         }
3903     }
3904   else
3905     {
3906       /* Add in cost for initial definition.
3907          For cond reduction we have four vectors: initial index, step,
3908          initial result of the data reduction, initial value of the index
3909          reduction.  */
3910       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3911       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3912                                          scalar_to_vec, stmt_info, 0,
3913                                          vect_prologue);
3914
3915       /* Cost of reduction op inside loop.  */
3916       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3917                                       stmt_info, 0, vect_body);
3918     }
3919
3920   /* Determine cost of epilogue code.
3921
3922      We have a reduction operator that will reduce the vector in one statement.
3923      Also requires scalar extract.  */
3924
3925   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3926     {
3927       if (reduc_fn != IFN_LAST)
3928         {
3929           if (reduction_type == COND_REDUCTION)
3930             {
3931               /* An EQ stmt and an COND_EXPR stmt.  */
3932               epilogue_cost += record_stmt_cost (cost_vec, 2,
3933                                                  vector_stmt, stmt_info, 0,
3934                                                  vect_epilogue);
3935               /* Reduction of the max index and a reduction of the found
3936                  values.  */
3937               epilogue_cost += record_stmt_cost (cost_vec, 2,
3938                                                  vec_to_scalar, stmt_info, 0,
3939                                                  vect_epilogue);
3940               /* A broadcast of the max value.  */
3941               epilogue_cost += record_stmt_cost (cost_vec, 1,
3942                                                  scalar_to_vec, stmt_info, 0,
3943                                                  vect_epilogue);
3944             }
3945           else
3946             {
3947               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3948                                                  stmt_info, 0, vect_epilogue);
3949               epilogue_cost += record_stmt_cost (cost_vec, 1,
3950                                                  vec_to_scalar, stmt_info, 0,
3951                                                  vect_epilogue);
3952             }
3953         }
3954       else if (reduction_type == COND_REDUCTION)
3955         {
3956           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3957           /* Extraction of scalar elements.  */
3958           epilogue_cost += record_stmt_cost (cost_vec,
3959                                              2 * estimated_nunits,
3960                                              vec_to_scalar, stmt_info, 0,
3961                                              vect_epilogue);
3962           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3963           epilogue_cost += record_stmt_cost (cost_vec,
3964                                              2 * estimated_nunits - 3,
3965                                              scalar_stmt, stmt_info, 0,
3966                                              vect_epilogue);
3967         }
3968       else if (reduction_type == EXTRACT_LAST_REDUCTION
3969                || reduction_type == FOLD_LEFT_REDUCTION)
3970         /* No extra instructions need in the epilogue.  */
3971         ;
3972       else
3973         {
3974           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3975           tree bitsize =
3976             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3977           int element_bitsize = tree_to_uhwi (bitsize);
3978           int nelements = vec_size_in_bits / element_bitsize;
3979
3980           if (code == COND_EXPR)
3981             code = MAX_EXPR;
3982
3983           optab = optab_for_tree_code (code, vectype, optab_default);
3984
3985           /* We have a whole vector shift available.  */
3986           if (optab != unknown_optab
3987               && VECTOR_MODE_P (mode)
3988               && optab_handler (optab, mode) != CODE_FOR_nothing
3989               && have_whole_vector_shift (mode))
3990             {
3991               /* Final reduction via vector shifts and the reduction operator.
3992                  Also requires scalar extract.  */
3993               epilogue_cost += record_stmt_cost (cost_vec,
3994                                                  exact_log2 (nelements) * 2,
3995                                                  vector_stmt, stmt_info, 0,
3996                                                  vect_epilogue);
3997               epilogue_cost += record_stmt_cost (cost_vec, 1,
3998                                                  vec_to_scalar, stmt_info, 0,
3999                                                  vect_epilogue);
4000             }
4001           else
4002             /* Use extracts and reduction op for final reduction.  For N
4003                elements, we have N extracts and N-1 reduction ops.  */
4004             epilogue_cost += record_stmt_cost (cost_vec,
4005                                                nelements + nelements - 1,
4006                                                vector_stmt, stmt_info, 0,
4007                                                vect_epilogue);
4008         }
4009     }
4010
4011   if (dump_enabled_p ())
4012     dump_printf (MSG_NOTE,
4013                  "vect_model_reduction_cost: inside_cost = %d, "
4014                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4015                  prologue_cost, epilogue_cost);
4016 }
4017
4018
4019 /* Function vect_model_induction_cost.
4020
4021    Models cost for induction operations.  */
4022
4023 static void
4024 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4025                            stmt_vector_for_cost *cost_vec)
4026 {
4027   unsigned inside_cost, prologue_cost;
4028
4029   if (PURE_SLP_STMT (stmt_info))
4030     return;
4031
4032   /* loop cost for vec_loop.  */
4033   inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4034                                   stmt_info, 0, vect_body);
4035
4036   /* prologue cost for vec_init and vec_step.  */
4037   prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4038                                     stmt_info, 0, vect_prologue);
4039
4040   if (dump_enabled_p ())
4041     dump_printf_loc (MSG_NOTE, vect_location,
4042                      "vect_model_induction_cost: inside_cost = %d, "
4043                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4044 }
4045
4046
4047
4048 /* Function get_initial_def_for_reduction
4049
4050    Input:
4051    STMT - a stmt that performs a reduction operation in the loop.
4052    INIT_VAL - the initial value of the reduction variable
4053
4054    Output:
4055    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4056         of the reduction (used for adjusting the epilog - see below).
4057    Return a vector variable, initialized according to the operation that STMT
4058         performs. This vector will be used as the initial value of the
4059         vector of partial results.
4060
4061    Option1 (adjust in epilog): Initialize the vector as follows:
4062      add/bit or/xor:    [0,0,...,0,0]
4063      mult/bit and:      [1,1,...,1,1]
4064      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4065    and when necessary (e.g. add/mult case) let the caller know
4066    that it needs to adjust the result by init_val.
4067
4068    Option2: Initialize the vector as follows:
4069      add/bit or/xor:    [init_val,0,0,...,0]
4070      mult/bit and:      [init_val,1,1,...,1]
4071      min/max/cond_expr: [init_val,init_val,...,init_val]
4072    and no adjustments are needed.
4073
4074    For example, for the following code:
4075
4076    s = init_val;
4077    for (i=0;i<n;i++)
4078      s = s + a[i];
4079
4080    STMT is 's = s + a[i]', and the reduction variable is 's'.
4081    For a vector of 4 units, we want to return either [0,0,0,init_val],
4082    or [0,0,0,0] and let the caller know that it needs to adjust
4083    the result at the end by 'init_val'.
4084
4085    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4086    initialization vector is simpler (same element in all entries), if
4087    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4088
4089    A cost model should help decide between these two schemes.  */
4090
4091 tree
4092 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4093                                tree *adjustment_def)
4094 {
4095   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4096   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4097   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4098   tree scalar_type = TREE_TYPE (init_val);
4099   tree vectype = get_vectype_for_scalar_type (scalar_type);
4100   enum tree_code code = gimple_assign_rhs_code (stmt);
4101   tree def_for_init;
4102   tree init_def;
4103   bool nested_in_vect_loop = false;
4104   REAL_VALUE_TYPE real_init_val = dconst0;
4105   int int_init_val = 0;
4106   gimple *def_stmt = NULL;
4107   gimple_seq stmts = NULL;
4108
4109   gcc_assert (vectype);
4110
4111   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4112               || SCALAR_FLOAT_TYPE_P (scalar_type));
4113
4114   if (nested_in_vect_loop_p (loop, stmt))
4115     nested_in_vect_loop = true;
4116   else
4117     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4118
4119   /* In case of double reduction we only create a vector variable to be put
4120      in the reduction phi node.  The actual statement creation is done in
4121      vect_create_epilog_for_reduction.  */
4122   if (adjustment_def && nested_in_vect_loop
4123       && TREE_CODE (init_val) == SSA_NAME
4124       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4125       && gimple_code (def_stmt) == GIMPLE_PHI
4126       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4127       && vinfo_for_stmt (def_stmt)
4128       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4129           == vect_double_reduction_def)
4130     {
4131       *adjustment_def = NULL;
4132       return vect_create_destination_var (init_val, vectype);
4133     }
4134
4135   vect_reduction_type reduction_type
4136     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4137
4138   /* In case of a nested reduction do not use an adjustment def as
4139      that case is not supported by the epilogue generation correctly
4140      if ncopies is not one.  */
4141   if (adjustment_def && nested_in_vect_loop)
4142     {
4143       *adjustment_def = NULL;
4144       return vect_get_vec_def_for_operand (init_val, stmt);
4145     }
4146
4147   switch (code)
4148     {
4149     case WIDEN_SUM_EXPR:
4150     case DOT_PROD_EXPR:
4151     case SAD_EXPR:
4152     case PLUS_EXPR:
4153     case MINUS_EXPR:
4154     case BIT_IOR_EXPR:
4155     case BIT_XOR_EXPR:
4156     case MULT_EXPR:
4157     case BIT_AND_EXPR:
4158       {
4159         /* ADJUSTMENT_DEF is NULL when called from
4160            vect_create_epilog_for_reduction to vectorize double reduction.  */
4161         if (adjustment_def)
4162           *adjustment_def = init_val;
4163
4164         if (code == MULT_EXPR)
4165           {
4166             real_init_val = dconst1;
4167             int_init_val = 1;
4168           }
4169
4170         if (code == BIT_AND_EXPR)
4171           int_init_val = -1;
4172
4173         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4174           def_for_init = build_real (scalar_type, real_init_val);
4175         else
4176           def_for_init = build_int_cst (scalar_type, int_init_val);
4177
4178         if (adjustment_def)
4179           /* Option1: the first element is '0' or '1' as well.  */
4180           init_def = gimple_build_vector_from_val (&stmts, vectype,
4181                                                    def_for_init);
4182         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4183           {
4184             /* Option2 (variable length): the first element is INIT_VAL.  */
4185             init_def = gimple_build_vector_from_val (&stmts, vectype,
4186                                                      def_for_init);
4187             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4188                                      vectype, init_def, init_val);
4189           }
4190         else
4191           {
4192             /* Option2: the first element is INIT_VAL.  */
4193             tree_vector_builder elts (vectype, 1, 2);
4194             elts.quick_push (init_val);
4195             elts.quick_push (def_for_init);
4196             init_def = gimple_build_vector (&stmts, &elts);
4197           }
4198       }
4199       break;
4200
4201     case MIN_EXPR:
4202     case MAX_EXPR:
4203     case COND_EXPR:
4204       {
4205         if (adjustment_def)
4206           {
4207             *adjustment_def = NULL_TREE;
4208             if (reduction_type != COND_REDUCTION
4209                 && reduction_type != EXTRACT_LAST_REDUCTION)
4210               {
4211                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4212                 break;
4213               }
4214           }
4215         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4216         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4217       }
4218       break;
4219
4220     default:
4221       gcc_unreachable ();
4222     }
4223
4224   if (stmts)
4225     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4226   return init_def;
4227 }
4228
4229 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4230    NUMBER_OF_VECTORS is the number of vector defs to create.
4231    If NEUTRAL_OP is nonnull, introducing extra elements of that
4232    value will not change the result.  */
4233
4234 static void
4235 get_initial_defs_for_reduction (slp_tree slp_node,
4236                                 vec<tree> *vec_oprnds,
4237                                 unsigned int number_of_vectors,
4238                                 bool reduc_chain, tree neutral_op)
4239 {
4240   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4241   gimple *stmt = stmts[0];
4242   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4243   unsigned HOST_WIDE_INT nunits;
4244   unsigned j, number_of_places_left_in_vector;
4245   tree vector_type;
4246   tree vop;
4247   int group_size = stmts.length ();
4248   unsigned int vec_num, i;
4249   unsigned number_of_copies = 1;
4250   vec<tree> voprnds;
4251   voprnds.create (number_of_vectors);
4252   struct loop *loop;
4253   auto_vec<tree, 16> permute_results;
4254
4255   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4256
4257   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4258
4259   loop = (gimple_bb (stmt))->loop_father;
4260   gcc_assert (loop);
4261   edge pe = loop_preheader_edge (loop);
4262
4263   gcc_assert (!reduc_chain || neutral_op);
4264
4265   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4266      created vectors. It is greater than 1 if unrolling is performed.
4267
4268      For example, we have two scalar operands, s1 and s2 (e.g., group of
4269      strided accesses of size two), while NUNITS is four (i.e., four scalars
4270      of this type can be packed in a vector).  The output vector will contain
4271      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4272      will be 2).
4273
4274      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4275      vectors containing the operands.
4276
4277      For example, NUNITS is four as before, and the group size is 8
4278      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4279      {s5, s6, s7, s8}.  */
4280
4281   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4282     nunits = group_size;
4283
4284   number_of_copies = nunits * number_of_vectors / group_size;
4285
4286   number_of_places_left_in_vector = nunits;
4287   bool constant_p = true;
4288   tree_vector_builder elts (vector_type, nunits, 1);
4289   elts.quick_grow (nunits);
4290   for (j = 0; j < number_of_copies; j++)
4291     {
4292       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4293         {
4294           tree op;
4295           /* Get the def before the loop.  In reduction chain we have only
4296              one initial value.  */
4297           if ((j != (number_of_copies - 1)
4298                || (reduc_chain && i != 0))
4299               && neutral_op)
4300             op = neutral_op;
4301           else
4302             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4303
4304           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4305           number_of_places_left_in_vector--;
4306           elts[number_of_places_left_in_vector] = op;
4307           if (!CONSTANT_CLASS_P (op))
4308             constant_p = false;
4309
4310           if (number_of_places_left_in_vector == 0)
4311             {
4312               gimple_seq ctor_seq = NULL;
4313               tree init;
4314               if (constant_p && !neutral_op
4315                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4316                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4317                 /* Build the vector directly from ELTS.  */
4318                 init = gimple_build_vector (&ctor_seq, &elts);
4319               else if (neutral_op)
4320                 {
4321                   /* Build a vector of the neutral value and shift the
4322                      other elements into place.  */
4323                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4324                                                        neutral_op);
4325                   int k = nunits;
4326                   while (k > 0 && elts[k - 1] == neutral_op)
4327                     k -= 1;
4328                   while (k > 0)
4329                     {
4330                       k -= 1;
4331                       init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4332                                            vector_type, init, elts[k]);
4333                     }
4334                 }
4335               else
4336                 {
4337                   /* First time round, duplicate ELTS to fill the
4338                      required number of vectors, then cherry pick the
4339                      appropriate result for each iteration.  */
4340                   if (vec_oprnds->is_empty ())
4341                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4342                                               number_of_vectors,
4343                                               permute_results);
4344                   init = permute_results[number_of_vectors - j - 1];
4345                 }
4346               if (ctor_seq != NULL)
4347                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4348               voprnds.quick_push (init);
4349
4350               number_of_places_left_in_vector = nunits;
4351               elts.new_vector (vector_type, nunits, 1);
4352               elts.quick_grow (nunits);
4353               constant_p = true;
4354             }
4355         }
4356     }
4357
4358   /* Since the vectors are created in the reverse order, we should invert
4359      them.  */
4360   vec_num = voprnds.length ();
4361   for (j = vec_num; j != 0; j--)
4362     {
4363       vop = voprnds[j - 1];
4364       vec_oprnds->quick_push (vop);
4365     }
4366
4367   voprnds.release ();
4368
4369   /* In case that VF is greater than the unrolling factor needed for the SLP
4370      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4371      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4372      to replicate the vectors.  */
4373   tree neutral_vec = NULL;
4374   while (number_of_vectors > vec_oprnds->length ())
4375     {
4376       if (neutral_op)
4377         {
4378           if (!neutral_vec)
4379             {
4380               gimple_seq ctor_seq = NULL;
4381               neutral_vec = gimple_build_vector_from_val
4382                 (&ctor_seq, vector_type, neutral_op);
4383               if (ctor_seq != NULL)
4384                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4385             }
4386           vec_oprnds->quick_push (neutral_vec);
4387         }
4388       else
4389         {
4390           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4391             vec_oprnds->quick_push (vop);
4392         }
4393     }
4394 }
4395
4396
4397 /* Function vect_create_epilog_for_reduction
4398
4399    Create code at the loop-epilog to finalize the result of a reduction
4400    computation.
4401
4402    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4403      reduction statements.
4404    STMT is the scalar reduction stmt that is being vectorized.
4405    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4406      number of elements that we can fit in a vectype (nunits).  In this case
4407      we have to generate more than one vector stmt - i.e - we need to "unroll"
4408      the vector stmt by a factor VF/nunits.  For more details see documentation
4409      in vectorizable_operation.
4410    REDUC_FN is the internal function for the epilog reduction.
4411    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4412      computation.
4413    REDUC_INDEX is the index of the operand in the right hand side of the
4414      statement that is defined by REDUCTION_PHI.
4415    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4416    SLP_NODE is an SLP node containing a group of reduction statements. The
4417      first one in this group is STMT.
4418    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4419      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4420      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4421      any value of the IV in the loop.
4422    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4423    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4424      null if this is not an SLP reduction
4425
4426    This function:
4427    1. Creates the reduction def-use cycles: sets the arguments for
4428       REDUCTION_PHIS:
4429       The loop-entry argument is the vectorized initial-value of the reduction.
4430       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4431       sums.
4432    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4433       by calling the function specified by REDUC_FN if available, or by
4434       other means (whole-vector shifts or a scalar loop).
4435       The function also creates a new phi node at the loop exit to preserve
4436       loop-closed form, as illustrated below.
4437
4438      The flow at the entry to this function:
4439
4440         loop:
4441           vec_def = phi <null, null>            # REDUCTION_PHI
4442           VECT_DEF = vector_stmt                # vectorized form of STMT
4443           s_loop = scalar_stmt                  # (scalar) STMT
4444         loop_exit:
4445           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4446           use <s_out0>
4447           use <s_out0>
4448
4449      The above is transformed by this function into:
4450
4451         loop:
4452           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4453           VECT_DEF = vector_stmt                # vectorized form of STMT
4454           s_loop = scalar_stmt                  # (scalar) STMT
4455         loop_exit:
4456           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4457           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4458           v_out2 = reduce <v_out1>
4459           s_out3 = extract_field <v_out2, 0>
4460           s_out4 = adjust_result <s_out3>
4461           use <s_out4>
4462           use <s_out4>
4463 */
4464
4465 static void
4466 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4467                                   gimple *reduc_def_stmt,
4468                                   int ncopies, internal_fn reduc_fn,
4469                                   vec<gimple *> reduction_phis,
4470                                   bool double_reduc,
4471                                   slp_tree slp_node,
4472                                   slp_instance slp_node_instance,
4473                                   tree induc_val, enum tree_code induc_code,
4474                                   tree neutral_op)
4475 {
4476   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4477   stmt_vec_info prev_phi_info;
4478   tree vectype;
4479   machine_mode mode;
4480   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4481   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4482   basic_block exit_bb;
4483   tree scalar_dest;
4484   tree scalar_type;
4485   gimple *new_phi = NULL, *phi;
4486   gimple_stmt_iterator exit_gsi;
4487   tree vec_dest;
4488   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4489   gimple *epilog_stmt = NULL;
4490   enum tree_code code = gimple_assign_rhs_code (stmt);
4491   gimple *exit_phi;
4492   tree bitsize;
4493   tree adjustment_def = NULL;
4494   tree vec_initial_def = NULL;
4495   tree expr, def, initial_def = NULL;
4496   tree orig_name, scalar_result;
4497   imm_use_iterator imm_iter, phi_imm_iter;
4498   use_operand_p use_p, phi_use_p;
4499   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4500   bool nested_in_vect_loop = false;
4501   auto_vec<gimple *> new_phis;
4502   auto_vec<gimple *> inner_phis;
4503   enum vect_def_type dt = vect_unknown_def_type;
4504   int j, i;
4505   auto_vec<tree> scalar_results;
4506   unsigned int group_size = 1, k, ratio;
4507   auto_vec<tree> vec_initial_defs;
4508   auto_vec<gimple *> phis;
4509   bool slp_reduc = false;
4510   bool direct_slp_reduc;
4511   tree new_phi_result;
4512   gimple *inner_phi = NULL;
4513   tree induction_index = NULL_TREE;
4514
4515   if (slp_node)
4516     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4517
4518   if (nested_in_vect_loop_p (loop, stmt))
4519     {
4520       outer_loop = loop;
4521       loop = loop->inner;
4522       nested_in_vect_loop = true;
4523       gcc_assert (!slp_node);
4524     }
4525
4526   vectype = STMT_VINFO_VECTYPE (stmt_info);
4527   gcc_assert (vectype);
4528   mode = TYPE_MODE (vectype);
4529
4530   /* 1. Create the reduction def-use cycle:
4531      Set the arguments of REDUCTION_PHIS, i.e., transform
4532
4533         loop:
4534           vec_def = phi <null, null>            # REDUCTION_PHI
4535           VECT_DEF = vector_stmt                # vectorized form of STMT
4536           ...
4537
4538      into:
4539
4540         loop:
4541           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4542           VECT_DEF = vector_stmt                # vectorized form of STMT
4543           ...
4544
4545      (in case of SLP, do it for all the phis). */
4546
4547   /* Get the loop-entry arguments.  */
4548   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4549   if (slp_node)
4550     {
4551       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4552       vec_initial_defs.reserve (vec_num);
4553       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4554                                       &vec_initial_defs, vec_num,
4555                                       REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4556                                       neutral_op);
4557     }
4558   else
4559     {
4560       /* Get at the scalar def before the loop, that defines the initial value
4561          of the reduction variable.  */
4562       gimple *def_stmt;
4563       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4564                                            loop_preheader_edge (loop));
4565       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4566          and we can't use zero for induc_val, use initial_def.  Similarly
4567          for REDUC_MIN and initial_def larger than the base.  */
4568       if (TREE_CODE (initial_def) == INTEGER_CST
4569           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4570               == INTEGER_INDUC_COND_REDUCTION)
4571           && !integer_zerop (induc_val)
4572           && ((induc_code == MAX_EXPR
4573                && tree_int_cst_lt (initial_def, induc_val))
4574               || (induc_code == MIN_EXPR
4575                   && tree_int_cst_lt (induc_val, initial_def))))
4576         induc_val = initial_def;
4577       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4578       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4579                                                        &adjustment_def);
4580       vec_initial_defs.create (1);
4581       vec_initial_defs.quick_push (vec_initial_def);
4582     }
4583
4584   /* Set phi nodes arguments.  */
4585   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4586     {
4587       tree vec_init_def = vec_initial_defs[i];
4588       tree def = vect_defs[i];
4589       for (j = 0; j < ncopies; j++)
4590         {
4591           if (j != 0)
4592             {
4593               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4594               if (nested_in_vect_loop)
4595                 vec_init_def
4596                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4597                                                     vec_init_def);
4598             }
4599
4600           /* Set the loop-entry arg of the reduction-phi.  */
4601
4602           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4603               == INTEGER_INDUC_COND_REDUCTION)
4604             {
4605               /* Initialise the reduction phi to zero.  This prevents initial
4606                  values of non-zero interferring with the reduction op.  */
4607               gcc_assert (ncopies == 1);
4608               gcc_assert (i == 0);
4609
4610               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4611               tree induc_val_vec
4612                 = build_vector_from_val (vec_init_def_type, induc_val);
4613
4614               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4615                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4616             }
4617           else
4618             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4619                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4620
4621           /* Set the loop-latch arg for the reduction-phi.  */
4622           if (j > 0)
4623             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4624
4625           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4626                        UNKNOWN_LOCATION);
4627
4628           if (dump_enabled_p ())
4629             {
4630               dump_printf_loc (MSG_NOTE, vect_location,
4631                                "transform reduction: created def-use cycle: ");
4632               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4633               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4634             }
4635         }
4636     }
4637
4638   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4639      which is updated with the current index of the loop for every match of
4640      the original loop's cond_expr (VEC_STMT).  This results in a vector
4641      containing the last time the condition passed for that vector lane.
4642      The first match will be a 1 to allow 0 to be used for non-matching
4643      indexes.  If there are no matches at all then the vector will be all
4644      zeroes.  */
4645   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4646     {
4647       tree indx_before_incr, indx_after_incr;
4648       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4649
4650       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4651       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4652
4653       int scalar_precision
4654         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4655       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4656       tree cr_index_vector_type = build_vector_type
4657         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4658
4659       /* First we create a simple vector induction variable which starts
4660          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4661          vector size (STEP).  */
4662
4663       /* Create a {1,2,3,...} vector.  */
4664       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4665
4666       /* Create a vector of the step value.  */
4667       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4668       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4669
4670       /* Create an induction variable.  */
4671       gimple_stmt_iterator incr_gsi;
4672       bool insert_after;
4673       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4674       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4675                  insert_after, &indx_before_incr, &indx_after_incr);
4676
4677       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4678          filled with zeros (VEC_ZERO).  */
4679
4680       /* Create a vector of 0s.  */
4681       tree zero = build_zero_cst (cr_index_scalar_type);
4682       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4683
4684       /* Create a vector phi node.  */
4685       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4686       new_phi = create_phi_node (new_phi_tree, loop->header);
4687       set_vinfo_for_stmt (new_phi,
4688                           new_stmt_vec_info (new_phi, loop_vinfo));
4689       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4690                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4691
4692       /* Now take the condition from the loops original cond_expr
4693          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4694          every match uses values from the induction variable
4695          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4696          (NEW_PHI_TREE).
4697          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4698          the new cond_expr (INDEX_COND_EXPR).  */
4699
4700       /* Duplicate the condition from vec_stmt.  */
4701       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4702
4703       /* Create a conditional, where the condition is taken from vec_stmt
4704          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4705          else is the phi (NEW_PHI_TREE).  */
4706       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4707                                      ccompare, indx_before_incr,
4708                                      new_phi_tree);
4709       induction_index = make_ssa_name (cr_index_vector_type);
4710       gimple *index_condition = gimple_build_assign (induction_index,
4711                                                      index_cond_expr);
4712       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4713       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4714                                                         loop_vinfo);
4715       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4716       set_vinfo_for_stmt (index_condition, index_vec_info);
4717
4718       /* Update the phi with the vec cond.  */
4719       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4720                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4721     }
4722
4723   /* 2. Create epilog code.
4724         The reduction epilog code operates across the elements of the vector
4725         of partial results computed by the vectorized loop.
4726         The reduction epilog code consists of:
4727
4728         step 1: compute the scalar result in a vector (v_out2)
4729         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4730         step 3: adjust the scalar result (s_out3) if needed.
4731
4732         Step 1 can be accomplished using one the following three schemes:
4733           (scheme 1) using reduc_fn, if available.
4734           (scheme 2) using whole-vector shifts, if available.
4735           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4736                      combined.
4737
4738           The overall epilog code looks like this:
4739
4740           s_out0 = phi <s_loop>         # original EXIT_PHI
4741           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4742           v_out2 = reduce <v_out1>              # step 1
4743           s_out3 = extract_field <v_out2, 0>    # step 2
4744           s_out4 = adjust_result <s_out3>       # step 3
4745
4746           (step 3 is optional, and steps 1 and 2 may be combined).
4747           Lastly, the uses of s_out0 are replaced by s_out4.  */
4748
4749
4750   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4751          v_out1 = phi <VECT_DEF>
4752          Store them in NEW_PHIS.  */
4753
4754   exit_bb = single_exit (loop)->dest;
4755   prev_phi_info = NULL;
4756   new_phis.create (vect_defs.length ());
4757   FOR_EACH_VEC_ELT (vect_defs, i, def)
4758     {
4759       for (j = 0; j < ncopies; j++)
4760         {
4761           tree new_def = copy_ssa_name (def);
4762           phi = create_phi_node (new_def, exit_bb);
4763           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4764           if (j == 0)
4765             new_phis.quick_push (phi);
4766           else
4767             {
4768               def = vect_get_vec_def_for_stmt_copy (dt, def);
4769               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4770             }
4771
4772           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4773           prev_phi_info = vinfo_for_stmt (phi);
4774         }
4775     }
4776
4777   /* The epilogue is created for the outer-loop, i.e., for the loop being
4778      vectorized.  Create exit phis for the outer loop.  */
4779   if (double_reduc)
4780     {
4781       loop = outer_loop;
4782       exit_bb = single_exit (loop)->dest;
4783       inner_phis.create (vect_defs.length ());
4784       FOR_EACH_VEC_ELT (new_phis, i, phi)
4785         {
4786           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4787           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4788           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4789                            PHI_RESULT (phi));
4790           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4791                                                             loop_vinfo));
4792           inner_phis.quick_push (phi);
4793           new_phis[i] = outer_phi;
4794           prev_phi_info = vinfo_for_stmt (outer_phi);
4795           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4796             {
4797               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4798               new_result = copy_ssa_name (PHI_RESULT (phi));
4799               outer_phi = create_phi_node (new_result, exit_bb);
4800               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4801                                PHI_RESULT (phi));
4802               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4803                                                                 loop_vinfo));
4804               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4805               prev_phi_info = vinfo_for_stmt (outer_phi);
4806             }
4807         }
4808     }
4809
4810   exit_gsi = gsi_after_labels (exit_bb);
4811
4812   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4813          (i.e. when reduc_fn is not available) and in the final adjustment
4814          code (if needed).  Also get the original scalar reduction variable as
4815          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4816          represents a reduction pattern), the tree-code and scalar-def are
4817          taken from the original stmt that the pattern-stmt (STMT) replaces.
4818          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4819          are taken from STMT.  */
4820
4821   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4822   if (!orig_stmt)
4823     {
4824       /* Regular reduction  */
4825       orig_stmt = stmt;
4826     }
4827   else
4828     {
4829       /* Reduction pattern  */
4830       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4831       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4832       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4833     }
4834
4835   code = gimple_assign_rhs_code (orig_stmt);
4836   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4837      partial results are added and not subtracted.  */
4838   if (code == MINUS_EXPR)
4839     code = PLUS_EXPR;
4840
4841   scalar_dest = gimple_assign_lhs (orig_stmt);
4842   scalar_type = TREE_TYPE (scalar_dest);
4843   scalar_results.create (group_size);
4844   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4845   bitsize = TYPE_SIZE (scalar_type);
4846
4847   /* In case this is a reduction in an inner-loop while vectorizing an outer
4848      loop - we don't need to extract a single scalar result at the end of the
4849      inner-loop (unless it is double reduction, i.e., the use of reduction is
4850      outside the outer-loop).  The final vector of partial results will be used
4851      in the vectorized outer-loop, or reduced to a scalar result at the end of
4852      the outer-loop.  */
4853   if (nested_in_vect_loop && !double_reduc)
4854     goto vect_finalize_reduction;
4855
4856   /* SLP reduction without reduction chain, e.g.,
4857      # a1 = phi <a2, a0>
4858      # b1 = phi <b2, b0>
4859      a2 = operation (a1)
4860      b2 = operation (b1)  */
4861   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4862
4863   /* True if we should implement SLP_REDUC using native reduction operations
4864      instead of scalar operations.  */
4865   direct_slp_reduc = (reduc_fn != IFN_LAST
4866                       && slp_reduc
4867                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4868
4869   /* In case of reduction chain, e.g.,
4870      # a1 = phi <a3, a0>
4871      a2 = operation (a1)
4872      a3 = operation (a2),
4873
4874      we may end up with more than one vector result.  Here we reduce them to
4875      one vector.  */
4876   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4877     {
4878       tree first_vect = PHI_RESULT (new_phis[0]);
4879       gassign *new_vec_stmt = NULL;
4880       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4881       for (k = 1; k < new_phis.length (); k++)
4882         {
4883           gimple *next_phi = new_phis[k];
4884           tree second_vect = PHI_RESULT (next_phi);
4885           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4886           new_vec_stmt = gimple_build_assign (tem, code,
4887                                               first_vect, second_vect);
4888           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4889           first_vect = tem;
4890         }
4891
4892       new_phi_result = first_vect;
4893       if (new_vec_stmt)
4894         {
4895           new_phis.truncate (0);
4896           new_phis.safe_push (new_vec_stmt);
4897         }
4898     }
4899   /* Likewise if we couldn't use a single defuse cycle.  */
4900   else if (ncopies > 1)
4901     {
4902       gcc_assert (new_phis.length () == 1);
4903       tree first_vect = PHI_RESULT (new_phis[0]);
4904       gassign *new_vec_stmt = NULL;
4905       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4906       gimple *next_phi = new_phis[0];
4907       for (int k = 1; k < ncopies; ++k)
4908         {
4909           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4910           tree second_vect = PHI_RESULT (next_phi);
4911           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4912           new_vec_stmt = gimple_build_assign (tem, code,
4913                                               first_vect, second_vect);
4914           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4915           first_vect = tem;
4916         }
4917       new_phi_result = first_vect;
4918       new_phis.truncate (0);
4919       new_phis.safe_push (new_vec_stmt);
4920     }
4921   else
4922     new_phi_result = PHI_RESULT (new_phis[0]);
4923
4924   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4925       && reduc_fn != IFN_LAST)
4926     {
4927       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4928          various data values where the condition matched and another vector
4929          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4930          need to extract the last matching index (which will be the index with
4931          highest value) and use this to index into the data vector.
4932          For the case where there were no matches, the data vector will contain
4933          all default values and the index vector will be all zeros.  */
4934
4935       /* Get various versions of the type of the vector of indexes.  */
4936       tree index_vec_type = TREE_TYPE (induction_index);
4937       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4938       tree index_scalar_type = TREE_TYPE (index_vec_type);
4939       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4940         (index_vec_type);
4941
4942       /* Get an unsigned integer version of the type of the data vector.  */
4943       int scalar_precision
4944         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4945       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4946       tree vectype_unsigned = build_vector_type
4947         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4948
4949       /* First we need to create a vector (ZERO_VEC) of zeros and another
4950          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4951          can create using a MAX reduction and then expanding.
4952          In the case where the loop never made any matches, the max index will
4953          be zero.  */
4954
4955       /* Vector of {0, 0, 0,...}.  */
4956       tree zero_vec = make_ssa_name (vectype);
4957       tree zero_vec_rhs = build_zero_cst (vectype);
4958       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4959       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4960
4961       /* Find maximum value from the vector of found indexes.  */
4962       tree max_index = make_ssa_name (index_scalar_type);
4963       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4964                                                           1, induction_index);
4965       gimple_call_set_lhs (max_index_stmt, max_index);
4966       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4967
4968       /* Vector of {max_index, max_index, max_index,...}.  */
4969       tree max_index_vec = make_ssa_name (index_vec_type);
4970       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4971                                                       max_index);
4972       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4973                                                         max_index_vec_rhs);
4974       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4975
4976       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4977          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4978          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4979          otherwise.  Only one value should match, resulting in a vector
4980          (VEC_COND) with one data value and the rest zeros.
4981          In the case where the loop never made any matches, every index will
4982          match, resulting in a vector with all data values (which will all be
4983          the default value).  */
4984
4985       /* Compare the max index vector to the vector of found indexes to find
4986          the position of the max value.  */
4987       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4988       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4989                                                       induction_index,
4990                                                       max_index_vec);
4991       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4992
4993       /* Use the compare to choose either values from the data vector or
4994          zero.  */
4995       tree vec_cond = make_ssa_name (vectype);
4996       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4997                                                    vec_compare, new_phi_result,
4998                                                    zero_vec);
4999       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5000
5001       /* Finally we need to extract the data value from the vector (VEC_COND)
5002          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5003          reduction, but because this doesn't exist, we can use a MAX reduction
5004          instead.  The data value might be signed or a float so we need to cast
5005          it first.
5006          In the case where the loop never made any matches, the data values are
5007          all identical, and so will reduce down correctly.  */
5008
5009       /* Make the matched data values unsigned.  */
5010       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5011       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5012                                        vec_cond);
5013       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5014                                                         VIEW_CONVERT_EXPR,
5015                                                         vec_cond_cast_rhs);
5016       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5017
5018       /* Reduce down to a scalar value.  */
5019       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5020       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5021                                                            1, vec_cond_cast);
5022       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5023       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5024
5025       /* Convert the reduced value back to the result type and set as the
5026          result.  */
5027       gimple_seq stmts = NULL;
5028       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5029                                data_reduc);
5030       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5031       scalar_results.safe_push (new_temp);
5032     }
5033   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5034            && reduc_fn == IFN_LAST)
5035     {
5036       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5037          idx = 0;
5038          idx_val = induction_index[0];
5039          val = data_reduc[0];
5040          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5041            if (induction_index[i] > idx_val)
5042              val = data_reduc[i], idx_val = induction_index[i];
5043          return val;  */
5044
5045       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5046       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5047       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5048       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5049       /* Enforced by vectorizable_reduction, which ensures we have target
5050          support before allowing a conditional reduction on variable-length
5051          vectors.  */
5052       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5053       tree idx_val = NULL_TREE, val = NULL_TREE;
5054       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5055         {
5056           tree old_idx_val = idx_val;
5057           tree old_val = val;
5058           idx_val = make_ssa_name (idx_eltype);
5059           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5060                                              build3 (BIT_FIELD_REF, idx_eltype,
5061                                                      induction_index,
5062                                                      bitsize_int (el_size),
5063                                                      bitsize_int (off)));
5064           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5065           val = make_ssa_name (data_eltype);
5066           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5067                                              build3 (BIT_FIELD_REF,
5068                                                      data_eltype,
5069                                                      new_phi_result,
5070                                                      bitsize_int (el_size),
5071                                                      bitsize_int (off)));
5072           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5073           if (off != 0)
5074             {
5075               tree new_idx_val = idx_val;
5076               tree new_val = val;
5077               if (off != v_size - el_size)
5078                 {
5079                   new_idx_val = make_ssa_name (idx_eltype);
5080                   epilog_stmt = gimple_build_assign (new_idx_val,
5081                                                      MAX_EXPR, idx_val,
5082                                                      old_idx_val);
5083                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5084                 }
5085               new_val = make_ssa_name (data_eltype);
5086               epilog_stmt = gimple_build_assign (new_val,
5087                                                  COND_EXPR,
5088                                                  build2 (GT_EXPR,
5089                                                          boolean_type_node,
5090                                                          idx_val,
5091                                                          old_idx_val),
5092                                                  val, old_val);
5093               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5094               idx_val = new_idx_val;
5095               val = new_val;
5096             }
5097         }
5098       /* Convert the reduced value back to the result type and set as the
5099          result.  */
5100       gimple_seq stmts = NULL;
5101       val = gimple_convert (&stmts, scalar_type, val);
5102       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5103       scalar_results.safe_push (val);
5104     }
5105
5106   /* 2.3 Create the reduction code, using one of the three schemes described
5107          above. In SLP we simply need to extract all the elements from the
5108          vector (without reducing them), so we use scalar shifts.  */
5109   else if (reduc_fn != IFN_LAST && !slp_reduc)
5110     {
5111       tree tmp;
5112       tree vec_elem_type;
5113
5114       /* Case 1:  Create:
5115          v_out2 = reduc_expr <v_out1>  */
5116
5117       if (dump_enabled_p ())
5118         dump_printf_loc (MSG_NOTE, vect_location,
5119                          "Reduce using direct vector reduction.\n");
5120
5121       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5122       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5123         {
5124           tree tmp_dest
5125             = vect_create_destination_var (scalar_dest, vec_elem_type);
5126           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5127                                                     new_phi_result);
5128           gimple_set_lhs (epilog_stmt, tmp_dest);
5129           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5130           gimple_set_lhs (epilog_stmt, new_temp);
5131           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5132
5133           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5134                                              new_temp);
5135         }
5136       else
5137         {
5138           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5139                                                     new_phi_result);
5140           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5141         }
5142
5143       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5144       gimple_set_lhs (epilog_stmt, new_temp);
5145       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5146
5147       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5148            == INTEGER_INDUC_COND_REDUCTION)
5149           && !operand_equal_p (initial_def, induc_val, 0))
5150         {
5151           /* Earlier we set the initial value to be a vector if induc_val
5152              values.  Check the result and if it is induc_val then replace
5153              with the original initial value, unless induc_val is
5154              the same as initial_def already.  */
5155           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5156                                   induc_val);
5157
5158           tmp = make_ssa_name (new_scalar_dest);
5159           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5160                                              initial_def, new_temp);
5161           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5162           new_temp = tmp;
5163         }
5164
5165       scalar_results.safe_push (new_temp);
5166     }
5167   else if (direct_slp_reduc)
5168     {
5169       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5170          with the elements for other SLP statements replaced with the
5171          neutral value.  We can then do a normal reduction on each vector.  */
5172
5173       /* Enforced by vectorizable_reduction.  */
5174       gcc_assert (new_phis.length () == 1);
5175       gcc_assert (pow2p_hwi (group_size));
5176
5177       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5178       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5179       gimple_seq seq = NULL;
5180
5181       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5182          and the same element size as VECTYPE.  */
5183       tree index = build_index_vector (vectype, 0, 1);
5184       tree index_type = TREE_TYPE (index);
5185       tree index_elt_type = TREE_TYPE (index_type);
5186       tree mask_type = build_same_sized_truth_vector_type (index_type);
5187
5188       /* Create a vector that, for each element, identifies which of
5189          the REDUC_GROUP_SIZE results should use it.  */
5190       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5191       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5192                             build_vector_from_val (index_type, index_mask));
5193
5194       /* Get a neutral vector value.  This is simply a splat of the neutral
5195          scalar value if we have one, otherwise the initial scalar value
5196          is itself a neutral value.  */
5197       tree vector_identity = NULL_TREE;
5198       if (neutral_op)
5199         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5200                                                         neutral_op);
5201       for (unsigned int i = 0; i < group_size; ++i)
5202         {
5203           /* If there's no univeral neutral value, we can use the
5204              initial scalar value from the original PHI.  This is used
5205              for MIN and MAX reduction, for example.  */
5206           if (!neutral_op)
5207             {
5208               tree scalar_value
5209                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5210                                          loop_preheader_edge (loop));
5211               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5212                                                               scalar_value);
5213             }
5214
5215           /* Calculate the equivalent of:
5216
5217              sel[j] = (index[j] == i);
5218
5219              which selects the elements of NEW_PHI_RESULT that should
5220              be included in the result.  */
5221           tree compare_val = build_int_cst (index_elt_type, i);
5222           compare_val = build_vector_from_val (index_type, compare_val);
5223           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5224                                    index, compare_val);
5225
5226           /* Calculate the equivalent of:
5227
5228              vec = seq ? new_phi_result : vector_identity;
5229
5230              VEC is now suitable for a full vector reduction.  */
5231           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5232                                    sel, new_phi_result, vector_identity);
5233
5234           /* Do the reduction and convert it to the appropriate type.  */
5235           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5236                                       TREE_TYPE (vectype), vec);
5237           scalar = gimple_convert (&seq, scalar_type, scalar);
5238           scalar_results.safe_push (scalar);
5239         }
5240       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5241     }
5242   else
5243     {
5244       bool reduce_with_shift;
5245       tree vec_temp;
5246
5247       /* COND reductions all do the final reduction with MAX_EXPR
5248          or MIN_EXPR.  */
5249       if (code == COND_EXPR)
5250         {
5251           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5252               == INTEGER_INDUC_COND_REDUCTION)
5253             code = induc_code;
5254           else
5255             code = MAX_EXPR;
5256         }
5257
5258       /* See if the target wants to do the final (shift) reduction
5259          in a vector mode of smaller size and first reduce upper/lower
5260          halves against each other.  */
5261       enum machine_mode mode1 = mode;
5262       tree vectype1 = vectype;
5263       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5264       unsigned sz1 = sz;
5265       if (!slp_reduc
5266           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5267         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5268
5269       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5270       reduce_with_shift = have_whole_vector_shift (mode1);
5271       if (!VECTOR_MODE_P (mode1))
5272         reduce_with_shift = false;
5273       else
5274         {
5275           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5276           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5277             reduce_with_shift = false;
5278         }
5279
5280       /* First reduce the vector to the desired vector size we should
5281          do shift reduction on by combining upper and lower halves.  */
5282       new_temp = new_phi_result;
5283       while (sz > sz1)
5284         {
5285           gcc_assert (!slp_reduc);
5286           sz /= 2;
5287           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5288
5289           /* The target has to make sure we support lowpart/highpart
5290              extraction, either via direct vector extract or through
5291              an integer mode punning.  */
5292           tree dst1, dst2;
5293           if (convert_optab_handler (vec_extract_optab,
5294                                      TYPE_MODE (TREE_TYPE (new_temp)),
5295                                      TYPE_MODE (vectype1))
5296               != CODE_FOR_nothing)
5297             {
5298               /* Extract sub-vectors directly once vec_extract becomes
5299                  a conversion optab.  */
5300               dst1 = make_ssa_name (vectype1);
5301               epilog_stmt
5302                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5303                                          build3 (BIT_FIELD_REF, vectype1,
5304                                                  new_temp, TYPE_SIZE (vectype1),
5305                                                  bitsize_int (0)));
5306               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5307               dst2 =  make_ssa_name (vectype1);
5308               epilog_stmt
5309                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5310                                          build3 (BIT_FIELD_REF, vectype1,
5311                                                  new_temp, TYPE_SIZE (vectype1),
5312                                                  bitsize_int (sz * BITS_PER_UNIT)));
5313               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5314             }
5315           else
5316             {
5317               /* Extract via punning to appropriately sized integer mode
5318                  vector.  */
5319               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5320                                                             1);
5321               tree etype = build_vector_type (eltype, 2);
5322               gcc_assert (convert_optab_handler (vec_extract_optab,
5323                                                  TYPE_MODE (etype),
5324                                                  TYPE_MODE (eltype))
5325                           != CODE_FOR_nothing);
5326               tree tem = make_ssa_name (etype);
5327               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5328                                                  build1 (VIEW_CONVERT_EXPR,
5329                                                          etype, new_temp));
5330               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5331               new_temp = tem;
5332               tem = make_ssa_name (eltype);
5333               epilog_stmt
5334                   = gimple_build_assign (tem, BIT_FIELD_REF,
5335                                          build3 (BIT_FIELD_REF, eltype,
5336                                                  new_temp, TYPE_SIZE (eltype),
5337                                                  bitsize_int (0)));
5338               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5339               dst1 = make_ssa_name (vectype1);
5340               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5341                                                  build1 (VIEW_CONVERT_EXPR,
5342                                                          vectype1, tem));
5343               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5344               tem = make_ssa_name (eltype);
5345               epilog_stmt
5346                   = gimple_build_assign (tem, BIT_FIELD_REF,
5347                                          build3 (BIT_FIELD_REF, eltype,
5348                                                  new_temp, TYPE_SIZE (eltype),
5349                                                  bitsize_int (sz * BITS_PER_UNIT)));
5350               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5351               dst2 =  make_ssa_name (vectype1);
5352               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5353                                                  build1 (VIEW_CONVERT_EXPR,
5354                                                          vectype1, tem));
5355               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5356             }
5357
5358           new_temp = make_ssa_name (vectype1);
5359           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5360           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5361         }
5362
5363       if (reduce_with_shift && !slp_reduc)
5364         {
5365           int element_bitsize = tree_to_uhwi (bitsize);
5366           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5367              for variable-length vectors and also requires direct target support
5368              for loop reductions.  */
5369           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5370           int nelements = vec_size_in_bits / element_bitsize;
5371           vec_perm_builder sel;
5372           vec_perm_indices indices;
5373
5374           int elt_offset;
5375
5376           tree zero_vec = build_zero_cst (vectype1);
5377           /* Case 2: Create:
5378              for (offset = nelements/2; offset >= 1; offset/=2)
5379                 {
5380                   Create:  va' = vec_shift <va, offset>
5381                   Create:  va = vop <va, va'>
5382                 }  */
5383
5384           tree rhs;
5385
5386           if (dump_enabled_p ())
5387             dump_printf_loc (MSG_NOTE, vect_location,
5388                              "Reduce using vector shifts\n");
5389
5390           mode1 = TYPE_MODE (vectype1);
5391           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5392           for (elt_offset = nelements / 2;
5393                elt_offset >= 1;
5394                elt_offset /= 2)
5395             {
5396               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5397               indices.new_vector (sel, 2, nelements);
5398               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5399               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5400                                                  new_temp, zero_vec, mask);
5401               new_name = make_ssa_name (vec_dest, epilog_stmt);
5402               gimple_assign_set_lhs (epilog_stmt, new_name);
5403               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5404
5405               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5406                                                  new_temp);
5407               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5408               gimple_assign_set_lhs (epilog_stmt, new_temp);
5409               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5410             }
5411
5412           /* 2.4  Extract the final scalar result.  Create:
5413              s_out3 = extract_field <v_out2, bitpos>  */
5414
5415           if (dump_enabled_p ())
5416             dump_printf_loc (MSG_NOTE, vect_location,
5417                              "extract scalar result\n");
5418
5419           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5420                         bitsize, bitsize_zero_node);
5421           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5422           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5423           gimple_assign_set_lhs (epilog_stmt, new_temp);
5424           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5425           scalar_results.safe_push (new_temp);
5426         }
5427       else
5428         {
5429           /* Case 3: Create:
5430              s = extract_field <v_out2, 0>
5431              for (offset = element_size;
5432                   offset < vector_size;
5433                   offset += element_size;)
5434                {
5435                  Create:  s' = extract_field <v_out2, offset>
5436                  Create:  s = op <s, s'>  // For non SLP cases
5437                }  */
5438
5439           if (dump_enabled_p ())
5440             dump_printf_loc (MSG_NOTE, vect_location,
5441                              "Reduce using scalar code.\n");
5442
5443           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5444           int element_bitsize = tree_to_uhwi (bitsize);
5445           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5446             {
5447               int bit_offset;
5448               if (gimple_code (new_phi) == GIMPLE_PHI)
5449                 vec_temp = PHI_RESULT (new_phi);
5450               else
5451                 vec_temp = gimple_assign_lhs (new_phi);
5452               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5453                                  bitsize_zero_node);
5454               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5455               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5456               gimple_assign_set_lhs (epilog_stmt, new_temp);
5457               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5458
5459               /* In SLP we don't need to apply reduction operation, so we just
5460                  collect s' values in SCALAR_RESULTS.  */
5461               if (slp_reduc)
5462                 scalar_results.safe_push (new_temp);
5463
5464               for (bit_offset = element_bitsize;
5465                    bit_offset < vec_size_in_bits;
5466                    bit_offset += element_bitsize)
5467                 {
5468                   tree bitpos = bitsize_int (bit_offset);
5469                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5470                                      bitsize, bitpos);
5471
5472                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5473                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5474                   gimple_assign_set_lhs (epilog_stmt, new_name);
5475                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5476
5477                   if (slp_reduc)
5478                     {
5479                       /* In SLP we don't need to apply reduction operation, so
5480                          we just collect s' values in SCALAR_RESULTS.  */
5481                       new_temp = new_name;
5482                       scalar_results.safe_push (new_name);
5483                     }
5484                   else
5485                     {
5486                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5487                                                          new_name, new_temp);
5488                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5489                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5490                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5491                     }
5492                 }
5493             }
5494
5495           /* The only case where we need to reduce scalar results in SLP, is
5496              unrolling.  If the size of SCALAR_RESULTS is greater than
5497              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5498              REDUC_GROUP_SIZE.  */
5499           if (slp_reduc)
5500             {
5501               tree res, first_res, new_res;
5502               gimple *new_stmt;
5503
5504               /* Reduce multiple scalar results in case of SLP unrolling.  */
5505               for (j = group_size; scalar_results.iterate (j, &res);
5506                    j++)
5507                 {
5508                   first_res = scalar_results[j % group_size];
5509                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5510                                                   first_res, res);
5511                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5512                   gimple_assign_set_lhs (new_stmt, new_res);
5513                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5514                   scalar_results[j % group_size] = new_res;
5515                 }
5516             }
5517           else
5518             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5519             scalar_results.safe_push (new_temp);
5520         }
5521
5522       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5523            == INTEGER_INDUC_COND_REDUCTION)
5524           && !operand_equal_p (initial_def, induc_val, 0))
5525         {
5526           /* Earlier we set the initial value to be a vector if induc_val
5527              values.  Check the result and if it is induc_val then replace
5528              with the original initial value, unless induc_val is
5529              the same as initial_def already.  */
5530           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5531                                   induc_val);
5532
5533           tree tmp = make_ssa_name (new_scalar_dest);
5534           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5535                                              initial_def, new_temp);
5536           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5537           scalar_results[0] = tmp;
5538         }
5539     }
5540
5541 vect_finalize_reduction:
5542
5543   if (double_reduc)
5544     loop = loop->inner;
5545
5546   /* 2.5 Adjust the final result by the initial value of the reduction
5547          variable. (When such adjustment is not needed, then
5548          'adjustment_def' is zero).  For example, if code is PLUS we create:
5549          new_temp = loop_exit_def + adjustment_def  */
5550
5551   if (adjustment_def)
5552     {
5553       gcc_assert (!slp_reduc);
5554       if (nested_in_vect_loop)
5555         {
5556           new_phi = new_phis[0];
5557           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5558           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5559           new_dest = vect_create_destination_var (scalar_dest, vectype);
5560         }
5561       else
5562         {
5563           new_temp = scalar_results[0];
5564           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5565           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5566           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5567         }
5568
5569       epilog_stmt = gimple_build_assign (new_dest, expr);
5570       new_temp = make_ssa_name (new_dest, epilog_stmt);
5571       gimple_assign_set_lhs (epilog_stmt, new_temp);
5572       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5573       if (nested_in_vect_loop)
5574         {
5575           set_vinfo_for_stmt (epilog_stmt,
5576                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5577           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5578                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5579
5580           if (!double_reduc)
5581             scalar_results.quick_push (new_temp);
5582           else
5583             scalar_results[0] = new_temp;
5584         }
5585       else
5586         scalar_results[0] = new_temp;
5587
5588       new_phis[0] = epilog_stmt;
5589     }
5590
5591   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5592           phis with new adjusted scalar results, i.e., replace use <s_out0>
5593           with use <s_out4>.
5594
5595      Transform:
5596         loop_exit:
5597           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5598           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5599           v_out2 = reduce <v_out1>
5600           s_out3 = extract_field <v_out2, 0>
5601           s_out4 = adjust_result <s_out3>
5602           use <s_out0>
5603           use <s_out0>
5604
5605      into:
5606
5607         loop_exit:
5608           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5609           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5610           v_out2 = reduce <v_out1>
5611           s_out3 = extract_field <v_out2, 0>
5612           s_out4 = adjust_result <s_out3>
5613           use <s_out4>
5614           use <s_out4> */
5615
5616
5617   /* In SLP reduction chain we reduce vector results into one vector if
5618      necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
5619      LHS of the last stmt in the reduction chain, since we are looking for
5620      the loop exit phi node.  */
5621   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5622     {
5623       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5624       /* Handle reduction patterns.  */
5625       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5626         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5627
5628       scalar_dest = gimple_assign_lhs (dest_stmt);
5629       group_size = 1;
5630     }
5631
5632   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5633      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5634      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5635      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5636      correspond to the first vector stmt, etc.
5637      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5638   if (group_size > new_phis.length ())
5639     {
5640       ratio = group_size / new_phis.length ();
5641       gcc_assert (!(group_size % new_phis.length ()));
5642     }
5643   else
5644     ratio = 1;
5645
5646   for (k = 0; k < group_size; k++)
5647     {
5648       if (k % ratio == 0)
5649         {
5650           epilog_stmt = new_phis[k / ratio];
5651           reduction_phi = reduction_phis[k / ratio];
5652           if (double_reduc)
5653             inner_phi = inner_phis[k / ratio];
5654         }
5655
5656       if (slp_reduc)
5657         {
5658           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5659
5660           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5661           /* SLP statements can't participate in patterns.  */
5662           gcc_assert (!orig_stmt);
5663           scalar_dest = gimple_assign_lhs (current_stmt);
5664         }
5665
5666       phis.create (3);
5667       /* Find the loop-closed-use at the loop exit of the original scalar
5668          result.  (The reduction result is expected to have two immediate uses -
5669          one at the latch block, and one at the loop exit).  */
5670       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5671         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5672             && !is_gimple_debug (USE_STMT (use_p)))
5673           phis.safe_push (USE_STMT (use_p));
5674
5675       /* While we expect to have found an exit_phi because of loop-closed-ssa
5676          form we can end up without one if the scalar cycle is dead.  */
5677
5678       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5679         {
5680           if (outer_loop)
5681             {
5682               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5683               gphi *vect_phi;
5684
5685               /* FORNOW. Currently not supporting the case that an inner-loop
5686                  reduction is not used in the outer-loop (but only outside the
5687                  outer-loop), unless it is double reduction.  */
5688               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5689                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5690                           || double_reduc);
5691
5692               if (double_reduc)
5693                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5694               else
5695                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5696               if (!double_reduc
5697                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5698                       != vect_double_reduction_def)
5699                 continue;
5700
5701               /* Handle double reduction:
5702
5703                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5704                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5705                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5706                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5707
5708                  At that point the regular reduction (stmt2 and stmt3) is
5709                  already vectorized, as well as the exit phi node, stmt4.
5710                  Here we vectorize the phi node of double reduction, stmt1, and
5711                  update all relevant statements.  */
5712
5713               /* Go through all the uses of s2 to find double reduction phi
5714                  node, i.e., stmt1 above.  */
5715               orig_name = PHI_RESULT (exit_phi);
5716               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5717                 {
5718                   stmt_vec_info use_stmt_vinfo;
5719                   stmt_vec_info new_phi_vinfo;
5720                   tree vect_phi_init, preheader_arg, vect_phi_res;
5721                   basic_block bb = gimple_bb (use_stmt);
5722                   gimple *use;
5723
5724                   /* Check that USE_STMT is really double reduction phi
5725                      node.  */
5726                   if (gimple_code (use_stmt) != GIMPLE_PHI
5727                       || gimple_phi_num_args (use_stmt) != 2
5728                       || bb->loop_father != outer_loop)
5729                     continue;
5730                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5731                   if (!use_stmt_vinfo
5732                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5733                           != vect_double_reduction_def)
5734                     continue;
5735
5736                   /* Create vector phi node for double reduction:
5737                      vs1 = phi <vs0, vs2>
5738                      vs1 was created previously in this function by a call to
5739                        vect_get_vec_def_for_operand and is stored in
5740                        vec_initial_def;
5741                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5742                      vs0 is created here.  */
5743
5744                   /* Create vector phi node.  */
5745                   vect_phi = create_phi_node (vec_initial_def, bb);
5746                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5747                                     loop_vec_info_for_loop (outer_loop));
5748                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5749
5750                   /* Create vs0 - initial def of the double reduction phi.  */
5751                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5752                                              loop_preheader_edge (outer_loop));
5753                   vect_phi_init = get_initial_def_for_reduction
5754                     (stmt, preheader_arg, NULL);
5755
5756                   /* Update phi node arguments with vs0 and vs2.  */
5757                   add_phi_arg (vect_phi, vect_phi_init,
5758                                loop_preheader_edge (outer_loop),
5759                                UNKNOWN_LOCATION);
5760                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5761                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5762                   if (dump_enabled_p ())
5763                     {
5764                       dump_printf_loc (MSG_NOTE, vect_location,
5765                                        "created double reduction phi node: ");
5766                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5767                     }
5768
5769                   vect_phi_res = PHI_RESULT (vect_phi);
5770
5771                   /* Replace the use, i.e., set the correct vs1 in the regular
5772                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5773                      loop is redundant.  */
5774                   use = reduction_phi;
5775                   for (j = 0; j < ncopies; j++)
5776                     {
5777                       edge pr_edge = loop_preheader_edge (loop);
5778                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5779                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5780                     }
5781                 }
5782             }
5783         }
5784
5785       phis.release ();
5786       if (nested_in_vect_loop)
5787         {
5788           if (double_reduc)
5789             loop = outer_loop;
5790           else
5791             continue;
5792         }
5793
5794       phis.create (3);
5795       /* Find the loop-closed-use at the loop exit of the original scalar
5796          result.  (The reduction result is expected to have two immediate uses,
5797          one at the latch block, and one at the loop exit).  For double
5798          reductions we are looking for exit phis of the outer loop.  */
5799       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5800         {
5801           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5802             {
5803               if (!is_gimple_debug (USE_STMT (use_p)))
5804                 phis.safe_push (USE_STMT (use_p));
5805             }
5806           else
5807             {
5808               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5809                 {
5810                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5811
5812                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5813                     {
5814                       if (!flow_bb_inside_loop_p (loop,
5815                                              gimple_bb (USE_STMT (phi_use_p)))
5816                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5817                         phis.safe_push (USE_STMT (phi_use_p));
5818                     }
5819                 }
5820             }
5821         }
5822
5823       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5824         {
5825           /* Replace the uses:  */
5826           orig_name = PHI_RESULT (exit_phi);
5827           scalar_result = scalar_results[k];
5828           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5829             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5830               SET_USE (use_p, scalar_result);
5831         }
5832
5833       phis.release ();
5834     }
5835 }
5836
5837 /* Return a vector of type VECTYPE that is equal to the vector select
5838    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5839    before GSI.  */
5840
5841 static tree
5842 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5843                      tree vec, tree identity)
5844 {
5845   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5846   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5847                                           mask, vec, identity);
5848   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5849   return cond;
5850 }
5851
5852 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5853    order, starting with LHS.  Insert the extraction statements before GSI and
5854    associate the new scalar SSA names with variable SCALAR_DEST.
5855    Return the SSA name for the result.  */
5856
5857 static tree
5858 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5859                        tree_code code, tree lhs, tree vector_rhs)
5860 {
5861   tree vectype = TREE_TYPE (vector_rhs);
5862   tree scalar_type = TREE_TYPE (vectype);
5863   tree bitsize = TYPE_SIZE (scalar_type);
5864   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5865   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5866
5867   for (unsigned HOST_WIDE_INT bit_offset = 0;
5868        bit_offset < vec_size_in_bits;
5869        bit_offset += element_bitsize)
5870     {
5871       tree bitpos = bitsize_int (bit_offset);
5872       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5873                          bitsize, bitpos);
5874
5875       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5876       rhs = make_ssa_name (scalar_dest, stmt);
5877       gimple_assign_set_lhs (stmt, rhs);
5878       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5879
5880       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5881       tree new_name = make_ssa_name (scalar_dest, stmt);
5882       gimple_assign_set_lhs (stmt, new_name);
5883       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5884       lhs = new_name;
5885     }
5886   return lhs;
5887 }
5888
5889 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
5890    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5891    statement.  CODE is the operation performed by STMT and OPS are
5892    its scalar operands.  REDUC_INDEX is the index of the operand in
5893    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5894    implements in-order reduction, or IFN_LAST if we should open-code it.
5895    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5896    that should be used to control the operation in a fully-masked loop.  */
5897
5898 static bool
5899 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5900                                gimple **vec_stmt, slp_tree slp_node,
5901                                gimple *reduc_def_stmt,
5902                                tree_code code, internal_fn reduc_fn,
5903                                tree ops[3], tree vectype_in,
5904                                int reduc_index, vec_loop_masks *masks)
5905 {
5906   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5907   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5908   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5909   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5910   gimple *new_stmt = NULL;
5911
5912   int ncopies;
5913   if (slp_node)
5914     ncopies = 1;
5915   else
5916     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5917
5918   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5919   gcc_assert (ncopies == 1);
5920   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5921   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5922   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5923               == FOLD_LEFT_REDUCTION);
5924
5925   if (slp_node)
5926     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5927                           TYPE_VECTOR_SUBPARTS (vectype_in)));
5928
5929   tree op0 = ops[1 - reduc_index];
5930
5931   int group_size = 1;
5932   gimple *scalar_dest_def;
5933   auto_vec<tree> vec_oprnds0;
5934   if (slp_node)
5935     {
5936       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5937       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5938       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5939     }
5940   else
5941     {
5942       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5943       vec_oprnds0.create (1);
5944       vec_oprnds0.quick_push (loop_vec_def0);
5945       scalar_dest_def = stmt;
5946     }
5947
5948   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5949   tree scalar_type = TREE_TYPE (scalar_dest);
5950   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5951
5952   int vec_num = vec_oprnds0.length ();
5953   gcc_assert (vec_num == 1 || slp_node);
5954   tree vec_elem_type = TREE_TYPE (vectype_out);
5955   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5956
5957   tree vector_identity = NULL_TREE;
5958   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5959     vector_identity = build_zero_cst (vectype_out);
5960
5961   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5962   int i;
5963   tree def0;
5964   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5965     {
5966       tree mask = NULL_TREE;
5967       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5968         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5969
5970       /* Handle MINUS by adding the negative.  */
5971       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5972         {
5973           tree negated = make_ssa_name (vectype_out);
5974           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5975           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5976           def0 = negated;
5977         }
5978
5979       if (mask)
5980         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5981                                     vector_identity);
5982
5983       /* On the first iteration the input is simply the scalar phi
5984          result, and for subsequent iterations it is the output of
5985          the preceding operation.  */
5986       if (reduc_fn != IFN_LAST)
5987         {
5988           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5989           /* For chained SLP reductions the output of the previous reduction
5990              operation serves as the input of the next. For the final statement
5991              the output cannot be a temporary - we reuse the original
5992              scalar destination of the last statement.  */
5993           if (i != vec_num - 1)
5994             {
5995               gimple_set_lhs (new_stmt, scalar_dest_var);
5996               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5997               gimple_set_lhs (new_stmt, reduc_var);
5998             }
5999         }
6000       else
6001         {
6002           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6003                                              reduc_var, def0);
6004           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6005           /* Remove the statement, so that we can use the same code paths
6006              as for statements that we've just created.  */
6007           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6008           gsi_remove (&tmp_gsi, false);
6009         }
6010
6011       if (i == vec_num - 1)
6012         {
6013           gimple_set_lhs (new_stmt, scalar_dest);
6014           vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6015         }
6016       else
6017         vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6018
6019       if (slp_node)
6020         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6021     }
6022
6023   if (!slp_node)
6024     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6025
6026   return true;
6027 }
6028
6029 /* Function is_nonwrapping_integer_induction.
6030
6031    Check if STMT (which is part of loop LOOP) both increments and
6032    does not cause overflow.  */
6033
6034 static bool
6035 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6036 {
6037   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6038   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6039   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6040   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6041   widest_int ni, max_loop_value, lhs_max;
6042   bool overflow = false;
6043
6044   /* Make sure the loop is integer based.  */
6045   if (TREE_CODE (base) != INTEGER_CST
6046       || TREE_CODE (step) != INTEGER_CST)
6047     return false;
6048
6049   /* Check that the max size of the loop will not wrap.  */
6050
6051   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6052     return true;
6053
6054   if (! max_stmt_executions (loop, &ni))
6055     return false;
6056
6057   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6058                             &overflow);
6059   if (overflow)
6060     return false;
6061
6062   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6063                             TYPE_SIGN (lhs_type), &overflow);
6064   if (overflow)
6065     return false;
6066
6067   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6068           <= TYPE_PRECISION (lhs_type));
6069 }
6070
6071 /* Function vectorizable_reduction.
6072
6073    Check if STMT performs a reduction operation that can be vectorized.
6074    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6075    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6076    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6077
6078    This function also handles reduction idioms (patterns) that have been
6079    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6080    of this form:
6081      X = pattern_expr (arg0, arg1, ..., X)
6082    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
6083    sequence that had been detected and replaced by the pattern-stmt (STMT).
6084
6085    This function also handles reduction of condition expressions, for example:
6086      for (int i = 0; i < N; i++)
6087        if (a[i] < value)
6088          last = a[i];
6089    This is handled by vectorising the loop and creating an additional vector
6090    containing the loop indexes for which "a[i] < value" was true.  In the
6091    function epilogue this is reduced to a single max value and then used to
6092    index into the vector of results.
6093
6094    In some cases of reduction patterns, the type of the reduction variable X is
6095    different than the type of the other arguments of STMT.
6096    In such cases, the vectype that is used when transforming STMT into a vector
6097    stmt is different than the vectype that is used to determine the
6098    vectorization factor, because it consists of a different number of elements
6099    than the actual number of elements that are being operated upon in parallel.
6100
6101    For example, consider an accumulation of shorts into an int accumulator.
6102    On some targets it's possible to vectorize this pattern operating on 8
6103    shorts at a time (hence, the vectype for purposes of determining the
6104    vectorization factor should be V8HI); on the other hand, the vectype that
6105    is used to create the vector form is actually V4SI (the type of the result).
6106
6107    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6108    indicates what is the actual level of parallelism (V8HI in the example), so
6109    that the right vectorization factor would be derived.  This vectype
6110    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6111    be used to create the vectorized stmt.  The right vectype for the vectorized
6112    stmt is obtained from the type of the result X:
6113         get_vectype_for_scalar_type (TREE_TYPE (X))
6114
6115    This means that, contrary to "regular" reductions (or "regular" stmts in
6116    general), the following equation:
6117       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6118    does *NOT* necessarily hold for reduction patterns.  */
6119
6120 bool
6121 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6122                         gimple **vec_stmt, slp_tree slp_node,
6123                         slp_instance slp_node_instance,
6124                         stmt_vector_for_cost *cost_vec)
6125 {
6126   tree vec_dest;
6127   tree scalar_dest;
6128   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6129   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6130   tree vectype_in = NULL_TREE;
6131   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6132   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6133   enum tree_code code, orig_code;
6134   internal_fn reduc_fn;
6135   machine_mode vec_mode;
6136   int op_type;
6137   optab optab;
6138   tree new_temp = NULL_TREE;
6139   gimple *def_stmt;
6140   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6141   gimple *cond_reduc_def_stmt = NULL;
6142   enum tree_code cond_reduc_op_code = ERROR_MARK;
6143   tree scalar_type;
6144   bool is_simple_use;
6145   gimple *orig_stmt;
6146   stmt_vec_info orig_stmt_info = NULL;
6147   int i;
6148   int ncopies;
6149   int epilog_copies;
6150   stmt_vec_info prev_stmt_info, prev_phi_info;
6151   bool single_defuse_cycle = false;
6152   gimple *new_stmt = NULL;
6153   int j;
6154   tree ops[3];
6155   enum vect_def_type dts[3];
6156   bool nested_cycle = false, found_nested_cycle_def = false;
6157   bool double_reduc = false;
6158   basic_block def_bb;
6159   struct loop * def_stmt_loop, *outer_loop = NULL;
6160   tree def_arg;
6161   gimple *def_arg_stmt;
6162   auto_vec<tree> vec_oprnds0;
6163   auto_vec<tree> vec_oprnds1;
6164   auto_vec<tree> vec_oprnds2;
6165   auto_vec<tree> vect_defs;
6166   auto_vec<gimple *> phis;
6167   int vec_num;
6168   tree def0, tem;
6169   bool first_p = true;
6170   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6171   tree cond_reduc_val = NULL_TREE;
6172
6173   /* Make sure it was already recognized as a reduction computation.  */
6174   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6175       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6176     return false;
6177
6178   if (nested_in_vect_loop_p (loop, stmt))
6179     {
6180       outer_loop = loop;
6181       loop = loop->inner;
6182       nested_cycle = true;
6183     }
6184
6185   /* In case of reduction chain we switch to the first stmt in the chain, but
6186      we don't update STMT_INFO, since only the last stmt is marked as reduction
6187      and has reduction properties.  */
6188   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6189       && REDUC_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6190     {
6191       stmt = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6192       first_p = false;
6193     }
6194
6195   if (gimple_code (stmt) == GIMPLE_PHI)
6196     {
6197       /* Analysis is fully done on the reduction stmt invocation.  */
6198       if (! vec_stmt)
6199         {
6200           if (slp_node)
6201             slp_node_instance->reduc_phis = slp_node;
6202
6203           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6204           return true;
6205         }
6206
6207       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6208         /* Leave the scalar phi in place.  Note that checking
6209            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6210            for reductions involving a single statement.  */
6211         return true;
6212
6213       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6214       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6215         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6216
6217       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6218           == EXTRACT_LAST_REDUCTION)
6219         /* Leave the scalar phi in place.  */
6220         return true;
6221
6222       gcc_assert (is_gimple_assign (reduc_stmt));
6223       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6224         {
6225           tree op = gimple_op (reduc_stmt, k);
6226           if (op == gimple_phi_result (stmt))
6227             continue;
6228           if (k == 1
6229               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6230             continue;
6231           if (!vectype_in
6232               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6233                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6234             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6235           break;
6236         }
6237       gcc_assert (vectype_in);
6238
6239       if (slp_node)
6240         ncopies = 1;
6241       else
6242         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6243
6244       use_operand_p use_p;
6245       gimple *use_stmt;
6246       if (ncopies > 1
6247           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6248               <= vect_used_only_live)
6249           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6250           && (use_stmt == reduc_stmt
6251               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6252                   == reduc_stmt)))
6253         single_defuse_cycle = true;
6254
6255       /* Create the destination vector  */
6256       scalar_dest = gimple_assign_lhs (reduc_stmt);
6257       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6258
6259       if (slp_node)
6260         /* The size vect_schedule_slp_instance computes is off for us.  */
6261         vec_num = vect_get_num_vectors
6262           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6263            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6264            vectype_in);
6265       else
6266         vec_num = 1;
6267
6268       /* Generate the reduction PHIs upfront.  */
6269       prev_phi_info = NULL;
6270       for (j = 0; j < ncopies; j++)
6271         {
6272           if (j == 0 || !single_defuse_cycle)
6273             {
6274               for (i = 0; i < vec_num; i++)
6275                 {
6276                   /* Create the reduction-phi that defines the reduction
6277                      operand.  */
6278                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6279                   set_vinfo_for_stmt (new_phi,
6280                                       new_stmt_vec_info (new_phi, loop_vinfo));
6281
6282                   if (slp_node)
6283                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6284                   else
6285                     {
6286                       if (j == 0)
6287                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6288                       else
6289                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6290                       prev_phi_info = vinfo_for_stmt (new_phi);
6291                     }
6292                 }
6293             }
6294         }
6295
6296       return true;
6297     }
6298
6299   /* 1. Is vectorizable reduction?  */
6300   /* Not supportable if the reduction variable is used in the loop, unless
6301      it's a reduction chain.  */
6302   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6303       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6304     return false;
6305
6306   /* Reductions that are not used even in an enclosing outer-loop,
6307      are expected to be "live" (used out of the loop).  */
6308   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6309       && !STMT_VINFO_LIVE_P (stmt_info))
6310     return false;
6311
6312   /* 2. Has this been recognized as a reduction pattern?
6313
6314      Check if STMT represents a pattern that has been recognized
6315      in earlier analysis stages.  For stmts that represent a pattern,
6316      the STMT_VINFO_RELATED_STMT field records the last stmt in
6317      the original sequence that constitutes the pattern.  */
6318
6319   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6320   if (orig_stmt)
6321     {
6322       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6323       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6324       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6325     }
6326
6327   /* 3. Check the operands of the operation.  The first operands are defined
6328         inside the loop body. The last operand is the reduction variable,
6329         which is defined by the loop-header-phi.  */
6330
6331   gcc_assert (is_gimple_assign (stmt));
6332
6333   /* Flatten RHS.  */
6334   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6335     {
6336     case GIMPLE_BINARY_RHS:
6337       code = gimple_assign_rhs_code (stmt);
6338       op_type = TREE_CODE_LENGTH (code);
6339       gcc_assert (op_type == binary_op);
6340       ops[0] = gimple_assign_rhs1 (stmt);
6341       ops[1] = gimple_assign_rhs2 (stmt);
6342       break;
6343
6344     case GIMPLE_TERNARY_RHS:
6345       code = gimple_assign_rhs_code (stmt);
6346       op_type = TREE_CODE_LENGTH (code);
6347       gcc_assert (op_type == ternary_op);
6348       ops[0] = gimple_assign_rhs1 (stmt);
6349       ops[1] = gimple_assign_rhs2 (stmt);
6350       ops[2] = gimple_assign_rhs3 (stmt);
6351       break;
6352
6353     case GIMPLE_UNARY_RHS:
6354       return false;
6355
6356     default:
6357       gcc_unreachable ();
6358     }
6359
6360   if (code == COND_EXPR && slp_node)
6361     return false;
6362
6363   scalar_dest = gimple_assign_lhs (stmt);
6364   scalar_type = TREE_TYPE (scalar_dest);
6365   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6366       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6367     return false;
6368
6369   /* Do not try to vectorize bit-precision reductions.  */
6370   if (!type_has_mode_precision_p (scalar_type))
6371     return false;
6372
6373   /* All uses but the last are expected to be defined in the loop.
6374      The last use is the reduction variable.  In case of nested cycle this
6375      assumption is not true: we use reduc_index to record the index of the
6376      reduction variable.  */
6377   gimple *reduc_def_stmt = NULL;
6378   int reduc_index = -1;
6379   for (i = 0; i < op_type; i++)
6380     {
6381       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6382       if (i == 0 && code == COND_EXPR)
6383         continue;
6384
6385       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6386                                           &def_stmt, &dts[i], &tem);
6387       dt = dts[i];
6388       gcc_assert (is_simple_use);
6389       if (dt == vect_reduction_def)
6390         {
6391           reduc_def_stmt = def_stmt;
6392           reduc_index = i;
6393           continue;
6394         }
6395       else if (tem)
6396         {
6397           /* To properly compute ncopies we are interested in the widest
6398              input type in case we're looking at a widening accumulation.  */
6399           if (!vectype_in
6400               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6401                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6402             vectype_in = tem;
6403         }
6404
6405       if (dt != vect_internal_def
6406           && dt != vect_external_def
6407           && dt != vect_constant_def
6408           && dt != vect_induction_def
6409           && !(dt == vect_nested_cycle && nested_cycle))
6410         return false;
6411
6412       if (dt == vect_nested_cycle)
6413         {
6414           found_nested_cycle_def = true;
6415           reduc_def_stmt = def_stmt;
6416           reduc_index = i;
6417         }
6418
6419       if (i == 1 && code == COND_EXPR)
6420         {
6421           /* Record how value of COND_EXPR is defined.  */
6422           if (dt == vect_constant_def)
6423             {
6424               cond_reduc_dt = dt;
6425               cond_reduc_val = ops[i];
6426             }
6427           if (dt == vect_induction_def
6428               && def_stmt != NULL
6429               && is_nonwrapping_integer_induction (def_stmt, loop))
6430             {
6431               cond_reduc_dt = dt;
6432               cond_reduc_def_stmt = def_stmt;
6433             }
6434         }
6435     }
6436
6437   if (!vectype_in)
6438     vectype_in = vectype_out;
6439
6440   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6441      directy used in stmt.  */
6442   if (reduc_index == -1)
6443     {
6444       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6445         {
6446           if (dump_enabled_p ())
6447             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6448                              "in-order reduction chain without SLP.\n");
6449           return false;
6450         }
6451
6452       if (orig_stmt)
6453         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6454       else
6455         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6456     }
6457
6458   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6459     return false;
6460
6461   if (!(reduc_index == -1
6462         || dts[reduc_index] == vect_reduction_def
6463         || dts[reduc_index] == vect_nested_cycle
6464         || ((dts[reduc_index] == vect_internal_def
6465              || dts[reduc_index] == vect_external_def
6466              || dts[reduc_index] == vect_constant_def
6467              || dts[reduc_index] == vect_induction_def)
6468             && nested_cycle && found_nested_cycle_def)))
6469     {
6470       /* For pattern recognized stmts, orig_stmt might be a reduction,
6471          but some helper statements for the pattern might not, or
6472          might be COND_EXPRs with reduction uses in the condition.  */
6473       gcc_assert (orig_stmt);
6474       return false;
6475     }
6476
6477   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6478   enum vect_reduction_type v_reduc_type
6479     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6480   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6481
6482   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6483   /* If we have a condition reduction, see if we can simplify it further.  */
6484   if (v_reduc_type == COND_REDUCTION)
6485     {
6486       /* TODO: We can't yet handle reduction chains, since we need to treat
6487          each COND_EXPR in the chain specially, not just the last one.
6488          E.g. for:
6489
6490             x_1 = PHI <x_3, ...>
6491             x_2 = a_2 ? ... : x_1;
6492             x_3 = a_3 ? ... : x_2;
6493
6494          we're interested in the last element in x_3 for which a_2 || a_3
6495          is true, whereas the current reduction chain handling would
6496          vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6497          as a reduction operation.  */
6498       if (reduc_index == -1)
6499         {
6500           if (dump_enabled_p ())
6501             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6502                              "conditional reduction chains not supported\n");
6503           return false;
6504         }
6505
6506       /* vect_is_simple_reduction ensured that operand 2 is the
6507          loop-carried operand.  */
6508       gcc_assert (reduc_index == 2);
6509
6510       /* Loop peeling modifies initial value of reduction PHI, which
6511          makes the reduction stmt to be transformed different to the
6512          original stmt analyzed.  We need to record reduction code for
6513          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6514          it can be used directly at transform stage.  */
6515       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6516           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6517         {
6518           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6519           gcc_assert (cond_reduc_dt == vect_constant_def);
6520           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6521         }
6522       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6523                                                vectype_in, OPTIMIZE_FOR_SPEED))
6524         {
6525           if (dump_enabled_p ())
6526             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6527                              "optimizing condition reduction with"
6528                              " FOLD_EXTRACT_LAST.\n");
6529           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6530         }
6531       else if (cond_reduc_dt == vect_induction_def)
6532         {
6533           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6534           tree base
6535             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6536           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6537
6538           gcc_assert (TREE_CODE (base) == INTEGER_CST
6539                       && TREE_CODE (step) == INTEGER_CST);
6540           cond_reduc_val = NULL_TREE;
6541           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6542              above base; punt if base is the minimum value of the type for
6543              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6544           if (tree_int_cst_sgn (step) == -1)
6545             {
6546               cond_reduc_op_code = MIN_EXPR;
6547               if (tree_int_cst_sgn (base) == -1)
6548                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6549               else if (tree_int_cst_lt (base,
6550                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6551                 cond_reduc_val
6552                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6553             }
6554           else
6555             {
6556               cond_reduc_op_code = MAX_EXPR;
6557               if (tree_int_cst_sgn (base) == 1)
6558                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6559               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6560                                         base))
6561                 cond_reduc_val
6562                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6563             }
6564           if (cond_reduc_val)
6565             {
6566               if (dump_enabled_p ())
6567                 dump_printf_loc (MSG_NOTE, vect_location,
6568                                  "condition expression based on "
6569                                  "integer induction.\n");
6570               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6571                 = INTEGER_INDUC_COND_REDUCTION;
6572             }
6573         }
6574       else if (cond_reduc_dt == vect_constant_def)
6575         {
6576           enum vect_def_type cond_initial_dt;
6577           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6578           tree cond_initial_val
6579             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6580
6581           gcc_assert (cond_reduc_val != NULL_TREE);
6582           vect_is_simple_use (cond_initial_val, loop_vinfo,
6583                               &def_stmt, &cond_initial_dt);
6584           if (cond_initial_dt == vect_constant_def
6585               && types_compatible_p (TREE_TYPE (cond_initial_val),
6586                                      TREE_TYPE (cond_reduc_val)))
6587             {
6588               tree e = fold_binary (LE_EXPR, boolean_type_node,
6589                                     cond_initial_val, cond_reduc_val);
6590               if (e && (integer_onep (e) || integer_zerop (e)))
6591                 {
6592                   if (dump_enabled_p ())
6593                     dump_printf_loc (MSG_NOTE, vect_location,
6594                                      "condition expression based on "
6595                                      "compile time constant.\n");
6596                   /* Record reduction code at analysis stage.  */
6597                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6598                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6599                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6600                     = CONST_COND_REDUCTION;
6601                 }
6602             }
6603         }
6604     }
6605
6606   if (orig_stmt)
6607     gcc_assert (tmp == orig_stmt
6608                 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6609                     == orig_stmt));
6610   else
6611     /* We changed STMT to be the first stmt in reduction chain, hence we
6612        check that in this case the first element in the chain is STMT.  */
6613     gcc_assert (stmt == tmp
6614                 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6615
6616   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6617     return false;
6618
6619   if (slp_node)
6620     ncopies = 1;
6621   else
6622     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6623
6624   gcc_assert (ncopies >= 1);
6625
6626   vec_mode = TYPE_MODE (vectype_in);
6627   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6628
6629   if (code == COND_EXPR)
6630     {
6631       /* Only call during the analysis stage, otherwise we'll lose
6632          STMT_VINFO_TYPE.  */
6633       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6634                                                 ops[reduc_index], 0, NULL,
6635                                                 cost_vec))
6636         {
6637           if (dump_enabled_p ())
6638             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6639                              "unsupported condition in reduction\n");
6640           return false;
6641         }
6642     }
6643   else
6644     {
6645       /* 4. Supportable by target?  */
6646
6647       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6648           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6649         {
6650           /* Shifts and rotates are only supported by vectorizable_shifts,
6651              not vectorizable_reduction.  */
6652           if (dump_enabled_p ())
6653             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6654                              "unsupported shift or rotation.\n");
6655           return false;
6656         }
6657
6658       /* 4.1. check support for the operation in the loop  */
6659       optab = optab_for_tree_code (code, vectype_in, optab_default);
6660       if (!optab)
6661         {
6662           if (dump_enabled_p ())
6663             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6664                              "no optab.\n");
6665
6666           return false;
6667         }
6668
6669       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6670         {
6671           if (dump_enabled_p ())
6672             dump_printf (MSG_NOTE, "op not supported by target.\n");
6673
6674           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6675               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6676             return false;
6677
6678           if (dump_enabled_p ())
6679             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6680         }
6681
6682       /* Worthwhile without SIMD support?  */
6683       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6684           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6685         {
6686           if (dump_enabled_p ())
6687             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6688                              "not worthwhile without SIMD support.\n");
6689
6690           return false;
6691         }
6692     }
6693
6694   /* 4.2. Check support for the epilog operation.
6695
6696           If STMT represents a reduction pattern, then the type of the
6697           reduction variable may be different than the type of the rest
6698           of the arguments.  For example, consider the case of accumulation
6699           of shorts into an int accumulator; The original code:
6700                         S1: int_a = (int) short_a;
6701           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6702
6703           was replaced with:
6704                         STMT: int_acc = widen_sum <short_a, int_acc>
6705
6706           This means that:
6707           1. The tree-code that is used to create the vector operation in the
6708              epilog code (that reduces the partial results) is not the
6709              tree-code of STMT, but is rather the tree-code of the original
6710              stmt from the pattern that STMT is replacing.  I.e, in the example
6711              above we want to use 'widen_sum' in the loop, but 'plus' in the
6712              epilog.
6713           2. The type (mode) we use to check available target support
6714              for the vector operation to be created in the *epilog*, is
6715              determined by the type of the reduction variable (in the example
6716              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6717              However the type (mode) we use to check available target support
6718              for the vector operation to be created *inside the loop*, is
6719              determined by the type of the other arguments to STMT (in the
6720              example we'd check this: optab_handler (widen_sum_optab,
6721              vect_short_mode)).
6722
6723           This is contrary to "regular" reductions, in which the types of all
6724           the arguments are the same as the type of the reduction variable.
6725           For "regular" reductions we can therefore use the same vector type
6726           (and also the same tree-code) when generating the epilog code and
6727           when generating the code inside the loop.  */
6728
6729   vect_reduction_type reduction_type
6730     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6731   if (orig_stmt
6732       && (reduction_type == TREE_CODE_REDUCTION
6733           || reduction_type == FOLD_LEFT_REDUCTION))
6734     {
6735       /* This is a reduction pattern: get the vectype from the type of the
6736          reduction variable, and get the tree-code from orig_stmt.  */
6737       orig_code = gimple_assign_rhs_code (orig_stmt);
6738       gcc_assert (vectype_out);
6739       vec_mode = TYPE_MODE (vectype_out);
6740     }
6741   else
6742     {
6743       /* Regular reduction: use the same vectype and tree-code as used for
6744          the vector code inside the loop can be used for the epilog code. */
6745       orig_code = code;
6746
6747       if (code == MINUS_EXPR)
6748         orig_code = PLUS_EXPR;
6749
6750       /* For simple condition reductions, replace with the actual expression
6751          we want to base our reduction around.  */
6752       if (reduction_type == CONST_COND_REDUCTION)
6753         {
6754           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6755           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6756         }
6757       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6758         orig_code = cond_reduc_op_code;
6759     }
6760
6761   if (nested_cycle)
6762     {
6763       def_bb = gimple_bb (reduc_def_stmt);
6764       def_stmt_loop = def_bb->loop_father;
6765       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6766                                        loop_preheader_edge (def_stmt_loop));
6767       if (TREE_CODE (def_arg) == SSA_NAME
6768           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6769           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6770           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6771           && vinfo_for_stmt (def_arg_stmt)
6772           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6773               == vect_double_reduction_def)
6774         double_reduc = true;
6775     }
6776
6777   reduc_fn = IFN_LAST;
6778
6779   if (reduction_type == TREE_CODE_REDUCTION
6780       || reduction_type == FOLD_LEFT_REDUCTION
6781       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6782       || reduction_type == CONST_COND_REDUCTION)
6783     {
6784       if (reduction_type == FOLD_LEFT_REDUCTION
6785           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6786           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6787         {
6788           if (reduc_fn != IFN_LAST
6789               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6790                                                   OPTIMIZE_FOR_SPEED))
6791             {
6792               if (dump_enabled_p ())
6793                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6794                                  "reduc op not supported by target.\n");
6795
6796               reduc_fn = IFN_LAST;
6797             }
6798         }
6799       else
6800         {
6801           if (!nested_cycle || double_reduc)
6802             {
6803               if (dump_enabled_p ())
6804                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6805                                  "no reduc code for scalar code.\n");
6806
6807               return false;
6808             }
6809         }
6810     }
6811   else if (reduction_type == COND_REDUCTION)
6812     {
6813       int scalar_precision
6814         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6815       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6816       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6817                                                 nunits_out);
6818
6819       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6820                                           OPTIMIZE_FOR_SPEED))
6821         reduc_fn = IFN_REDUC_MAX;
6822     }
6823
6824   if (reduction_type != EXTRACT_LAST_REDUCTION
6825       && reduc_fn == IFN_LAST
6826       && !nunits_out.is_constant ())
6827     {
6828       if (dump_enabled_p ())
6829         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6830                          "missing target support for reduction on"
6831                          " variable-length vectors.\n");
6832       return false;
6833     }
6834
6835   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6836       && ncopies > 1)
6837     {
6838       if (dump_enabled_p ())
6839         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6840                          "multiple types in double reduction or condition "
6841                          "reduction.\n");
6842       return false;
6843     }
6844
6845   /* For SLP reductions, see if there is a neutral value we can use.  */
6846   tree neutral_op = NULL_TREE;
6847   if (slp_node)
6848     neutral_op = neutral_op_for_slp_reduction
6849                    (slp_node_instance->reduc_phis, code,
6850                     REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6851
6852   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6853     {
6854       /* We can't support in-order reductions of code such as this:
6855
6856            for (int i = 0; i < n1; ++i)
6857              for (int j = 0; j < n2; ++j)
6858                l += a[j];
6859
6860          since GCC effectively transforms the loop when vectorizing:
6861
6862            for (int i = 0; i < n1 / VF; ++i)
6863              for (int j = 0; j < n2; ++j)
6864                for (int k = 0; k < VF; ++k)
6865                  l += a[j];
6866
6867          which is a reassociation of the original operation.  */
6868       if (dump_enabled_p ())
6869         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6870                          "in-order double reduction not supported.\n");
6871
6872       return false;
6873     }
6874
6875   if (reduction_type == FOLD_LEFT_REDUCTION
6876       && slp_node
6877       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6878     {
6879       /* We cannot use in-order reductions in this case because there is
6880          an implicit reassociation of the operations involved.  */
6881       if (dump_enabled_p ())
6882         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6883                          "in-order unchained SLP reductions not supported.\n");
6884       return false;
6885     }
6886
6887   /* For double reductions, and for SLP reductions with a neutral value,
6888      we construct a variable-length initial vector by loading a vector
6889      full of the neutral value and then shift-and-inserting the start
6890      values into the low-numbered elements.  */
6891   if ((double_reduc || neutral_op)
6892       && !nunits_out.is_constant ()
6893       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6894                                           vectype_out, OPTIMIZE_FOR_SPEED))
6895     {
6896       if (dump_enabled_p ())
6897         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6898                          "reduction on variable-length vectors requires"
6899                          " target support for a vector-shift-and-insert"
6900                          " operation.\n");
6901       return false;
6902     }
6903
6904   /* Check extra constraints for variable-length unchained SLP reductions.  */
6905   if (STMT_SLP_TYPE (stmt_info)
6906       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6907       && !nunits_out.is_constant ())
6908     {
6909       /* We checked above that we could build the initial vector when
6910          there's a neutral element value.  Check here for the case in
6911          which each SLP statement has its own initial value and in which
6912          that value needs to be repeated for every instance of the
6913          statement within the initial vector.  */
6914       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6915       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6916       if (!neutral_op
6917           && !can_duplicate_and_interleave_p (group_size, elt_mode))
6918         {
6919           if (dump_enabled_p ())
6920             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6921                              "unsupported form of SLP reduction for"
6922                              " variable-length vectors: cannot build"
6923                              " initial vector.\n");
6924           return false;
6925         }
6926       /* The epilogue code relies on the number of elements being a multiple
6927          of the group size.  The duplicate-and-interleave approach to setting
6928          up the the initial vector does too.  */
6929       if (!multiple_p (nunits_out, group_size))
6930         {
6931           if (dump_enabled_p ())
6932             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6933                              "unsupported form of SLP reduction for"
6934                              " variable-length vectors: the vector size"
6935                              " is not a multiple of the number of results.\n");
6936           return false;
6937         }
6938     }
6939
6940   /* In case of widenning multiplication by a constant, we update the type
6941      of the constant to be the type of the other operand.  We check that the
6942      constant fits the type in the pattern recognition pass.  */
6943   if (code == DOT_PROD_EXPR
6944       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6945     {
6946       if (TREE_CODE (ops[0]) == INTEGER_CST)
6947         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6948       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6949         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6950       else
6951         {
6952           if (dump_enabled_p ())
6953             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6954                              "invalid types in dot-prod\n");
6955
6956           return false;
6957         }
6958     }
6959
6960   if (reduction_type == COND_REDUCTION)
6961     {
6962       widest_int ni;
6963
6964       if (! max_loop_iterations (loop, &ni))
6965         {
6966           if (dump_enabled_p ())
6967             dump_printf_loc (MSG_NOTE, vect_location,
6968                              "loop count not known, cannot create cond "
6969                              "reduction.\n");
6970           return false;
6971         }
6972       /* Convert backedges to iterations.  */
6973       ni += 1;
6974
6975       /* The additional index will be the same type as the condition.  Check
6976          that the loop can fit into this less one (because we'll use up the
6977          zero slot for when there are no matches).  */
6978       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6979       if (wi::geu_p (ni, wi::to_widest (max_index)))
6980         {
6981           if (dump_enabled_p ())
6982             dump_printf_loc (MSG_NOTE, vect_location,
6983                              "loop size is greater than data size.\n");
6984           return false;
6985         }
6986     }
6987
6988   /* In case the vectorization factor (VF) is bigger than the number
6989      of elements that we can fit in a vectype (nunits), we have to generate
6990      more than one vector stmt - i.e - we need to "unroll" the
6991      vector stmt by a factor VF/nunits.  For more details see documentation
6992      in vectorizable_operation.  */
6993
6994   /* If the reduction is used in an outer loop we need to generate
6995      VF intermediate results, like so (e.g. for ncopies=2):
6996         r0 = phi (init, r0)
6997         r1 = phi (init, r1)
6998         r0 = x0 + r0;
6999         r1 = x1 + r1;
7000     (i.e. we generate VF results in 2 registers).
7001     In this case we have a separate def-use cycle for each copy, and therefore
7002     for each copy we get the vector def for the reduction variable from the
7003     respective phi node created for this copy.
7004
7005     Otherwise (the reduction is unused in the loop nest), we can combine
7006     together intermediate results, like so (e.g. for ncopies=2):
7007         r = phi (init, r)
7008         r = x0 + r;
7009         r = x1 + r;
7010    (i.e. we generate VF/2 results in a single register).
7011    In this case for each copy we get the vector def for the reduction variable
7012    from the vectorized reduction operation generated in the previous iteration.
7013
7014    This only works when we see both the reduction PHI and its only consumer
7015    in vectorizable_reduction and there are no intermediate stmts
7016    participating.  */
7017   use_operand_p use_p;
7018   gimple *use_stmt;
7019   if (ncopies > 1
7020       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7021       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7022       && (use_stmt == stmt
7023           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7024     {
7025       single_defuse_cycle = true;
7026       epilog_copies = 1;
7027     }
7028   else
7029     epilog_copies = ncopies;
7030
7031   /* If the reduction stmt is one of the patterns that have lane
7032      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7033   if ((ncopies > 1
7034        && ! single_defuse_cycle)
7035       && (code == DOT_PROD_EXPR
7036           || code == WIDEN_SUM_EXPR
7037           || code == SAD_EXPR))
7038     {
7039       if (dump_enabled_p ())
7040         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7041                          "multi def-use cycle not possible for lane-reducing "
7042                          "reduction operation\n");
7043       return false;
7044     }
7045
7046   if (slp_node)
7047     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7048   else
7049     vec_num = 1;
7050
7051   internal_fn cond_fn = get_conditional_internal_fn (code);
7052   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7053
7054   if (!vec_stmt) /* transformation not required.  */
7055     {
7056       if (first_p)
7057         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7058       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7059         {
7060           if (reduction_type != FOLD_LEFT_REDUCTION
7061               && (cond_fn == IFN_LAST
7062                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7063                                                       OPTIMIZE_FOR_SPEED)))
7064             {
7065               if (dump_enabled_p ())
7066                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7067                                  "can't use a fully-masked loop because no"
7068                                  " conditional operation is available.\n");
7069               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7070             }
7071           else if (reduc_index == -1)
7072             {
7073               if (dump_enabled_p ())
7074                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7075                                  "can't use a fully-masked loop for chained"
7076                                  " reductions.\n");
7077               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7078             }
7079           else
7080             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7081                                    vectype_in);
7082         }
7083       if (dump_enabled_p ()
7084           && reduction_type == FOLD_LEFT_REDUCTION)
7085         dump_printf_loc (MSG_NOTE, vect_location,
7086                          "using an in-order (fold-left) reduction.\n");
7087       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7088       return true;
7089     }
7090
7091   /* Transform.  */
7092
7093   if (dump_enabled_p ())
7094     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7095
7096   /* FORNOW: Multiple types are not supported for condition.  */
7097   if (code == COND_EXPR)
7098     gcc_assert (ncopies == 1);
7099
7100   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7101
7102   if (reduction_type == FOLD_LEFT_REDUCTION)
7103     return vectorize_fold_left_reduction
7104       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7105        reduc_fn, ops, vectype_in, reduc_index, masks);
7106
7107   if (reduction_type == EXTRACT_LAST_REDUCTION)
7108     {
7109       gcc_assert (!slp_node);
7110       return vectorizable_condition (stmt, gsi, vec_stmt,
7111                                      NULL, reduc_index, NULL, NULL);
7112     }
7113
7114   /* Create the destination vector  */
7115   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7116
7117   prev_stmt_info = NULL;
7118   prev_phi_info = NULL;
7119   if (!slp_node)
7120     {
7121       vec_oprnds0.create (1);
7122       vec_oprnds1.create (1);
7123       if (op_type == ternary_op)
7124         vec_oprnds2.create (1);
7125     }
7126
7127   phis.create (vec_num);
7128   vect_defs.create (vec_num);
7129   if (!slp_node)
7130     vect_defs.quick_push (NULL_TREE);
7131
7132   if (slp_node)
7133     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7134   else
7135     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7136
7137   for (j = 0; j < ncopies; j++)
7138     {
7139       if (code == COND_EXPR)
7140         {
7141           gcc_assert (!slp_node);
7142           vectorizable_condition (stmt, gsi, vec_stmt,
7143                                   PHI_RESULT (phis[0]),
7144                                   reduc_index, NULL, NULL);
7145           /* Multiple types are not supported for condition.  */
7146           break;
7147         }
7148
7149       /* Handle uses.  */
7150       if (j == 0)
7151         {
7152           if (slp_node)
7153             {
7154               /* Get vec defs for all the operands except the reduction index,
7155                  ensuring the ordering of the ops in the vector is kept.  */
7156               auto_vec<tree, 3> slp_ops;
7157               auto_vec<vec<tree>, 3> vec_defs;
7158
7159               slp_ops.quick_push (ops[0]);
7160               slp_ops.quick_push (ops[1]);
7161               if (op_type == ternary_op)
7162                 slp_ops.quick_push (ops[2]);
7163
7164               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7165
7166               vec_oprnds0.safe_splice (vec_defs[0]);
7167               vec_defs[0].release ();
7168               vec_oprnds1.safe_splice (vec_defs[1]);
7169               vec_defs[1].release ();
7170               if (op_type == ternary_op)
7171                 {
7172                   vec_oprnds2.safe_splice (vec_defs[2]);
7173                   vec_defs[2].release ();
7174                 }
7175             }
7176           else
7177             {
7178               vec_oprnds0.quick_push
7179                 (vect_get_vec_def_for_operand (ops[0], stmt));
7180               vec_oprnds1.quick_push
7181                 (vect_get_vec_def_for_operand (ops[1], stmt));
7182               if (op_type == ternary_op)
7183                 vec_oprnds2.quick_push
7184                   (vect_get_vec_def_for_operand (ops[2], stmt));
7185             }
7186         }
7187       else
7188         {
7189           if (!slp_node)
7190             {
7191               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7192
7193               if (single_defuse_cycle && reduc_index == 0)
7194                 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7195               else
7196                 vec_oprnds0[0]
7197                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7198               if (single_defuse_cycle && reduc_index == 1)
7199                 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7200               else
7201                 vec_oprnds1[0]
7202                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7203               if (op_type == ternary_op)
7204                 {
7205                   if (single_defuse_cycle && reduc_index == 2)
7206                     vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7207                   else
7208                     vec_oprnds2[0]
7209                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7210                 }
7211             }
7212         }
7213
7214       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7215         {
7216           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7217           if (masked_loop_p)
7218             {
7219               /* Make sure that the reduction accumulator is vop[0].  */
7220               if (reduc_index == 1)
7221                 {
7222                   gcc_assert (commutative_tree_code (code));
7223                   std::swap (vop[0], vop[1]);
7224                 }
7225               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7226                                               vectype_in, i * ncopies + j);
7227               gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7228                                                         vop[0], vop[1],
7229                                                         vop[0]);
7230               new_temp = make_ssa_name (vec_dest, call);
7231               gimple_call_set_lhs (call, new_temp);
7232               gimple_call_set_nothrow (call, true);
7233               new_stmt = call;
7234             }
7235           else
7236             {
7237               if (op_type == ternary_op)
7238                 vop[2] = vec_oprnds2[i];
7239
7240               new_temp = make_ssa_name (vec_dest, new_stmt);
7241               new_stmt = gimple_build_assign (new_temp, code,
7242                                               vop[0], vop[1], vop[2]);
7243             }
7244           vect_finish_stmt_generation (stmt, new_stmt, gsi);
7245
7246           if (slp_node)
7247             {
7248               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7249               vect_defs.quick_push (new_temp);
7250             }
7251           else
7252             vect_defs[0] = new_temp;
7253         }
7254
7255       if (slp_node)
7256         continue;
7257
7258       if (j == 0)
7259         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7260       else
7261         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7262
7263       prev_stmt_info = vinfo_for_stmt (new_stmt);
7264     }
7265
7266   /* Finalize the reduction-phi (set its arguments) and create the
7267      epilog reduction code.  */
7268   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7269     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7270
7271   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7272                                     epilog_copies, reduc_fn, phis,
7273                                     double_reduc, slp_node, slp_node_instance,
7274                                     cond_reduc_val, cond_reduc_op_code,
7275                                     neutral_op);
7276
7277   return true;
7278 }
7279
7280 /* Function vect_min_worthwhile_factor.
7281
7282    For a loop where we could vectorize the operation indicated by CODE,
7283    return the minimum vectorization factor that makes it worthwhile
7284    to use generic vectors.  */
7285 static unsigned int
7286 vect_min_worthwhile_factor (enum tree_code code)
7287 {
7288   switch (code)
7289     {
7290     case PLUS_EXPR:
7291     case MINUS_EXPR:
7292     case NEGATE_EXPR:
7293       return 4;
7294
7295     case BIT_AND_EXPR:
7296     case BIT_IOR_EXPR:
7297     case BIT_XOR_EXPR:
7298     case BIT_NOT_EXPR:
7299       return 2;
7300
7301     default:
7302       return INT_MAX;
7303     }
7304 }
7305
7306 /* Return true if VINFO indicates we are doing loop vectorization and if
7307    it is worth decomposing CODE operations into scalar operations for
7308    that loop's vectorization factor.  */
7309
7310 bool
7311 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7312 {
7313   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7314   unsigned HOST_WIDE_INT value;
7315   return (loop_vinfo
7316           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7317           && value >= vect_min_worthwhile_factor (code));
7318 }
7319
7320 /* Function vectorizable_induction
7321
7322    Check if PHI performs an induction computation that can be vectorized.
7323    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7324    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7325    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7326
7327 bool
7328 vectorizable_induction (gimple *phi,
7329                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7330                         gimple **vec_stmt, slp_tree slp_node,
7331                         stmt_vector_for_cost *cost_vec)
7332 {
7333   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7334   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7335   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7336   unsigned ncopies;
7337   bool nested_in_vect_loop = false;
7338   struct loop *iv_loop;
7339   tree vec_def;
7340   edge pe = loop_preheader_edge (loop);
7341   basic_block new_bb;
7342   tree new_vec, vec_init, vec_step, t;
7343   tree new_name;
7344   gimple *new_stmt;
7345   gphi *induction_phi;
7346   tree induc_def, vec_dest;
7347   tree init_expr, step_expr;
7348   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7349   unsigned i;
7350   tree expr;
7351   gimple_seq stmts;
7352   imm_use_iterator imm_iter;
7353   use_operand_p use_p;
7354   gimple *exit_phi;
7355   edge latch_e;
7356   tree loop_arg;
7357   gimple_stmt_iterator si;
7358   basic_block bb = gimple_bb (phi);
7359
7360   if (gimple_code (phi) != GIMPLE_PHI)
7361     return false;
7362
7363   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7364     return false;
7365
7366   /* Make sure it was recognized as induction computation.  */
7367   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7368     return false;
7369
7370   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7371   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7372
7373   if (slp_node)
7374     ncopies = 1;
7375   else
7376     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7377   gcc_assert (ncopies >= 1);
7378
7379   /* FORNOW. These restrictions should be relaxed.  */
7380   if (nested_in_vect_loop_p (loop, phi))
7381     {
7382       imm_use_iterator imm_iter;
7383       use_operand_p use_p;
7384       gimple *exit_phi;
7385       edge latch_e;
7386       tree loop_arg;
7387
7388       if (ncopies > 1)
7389         {
7390           if (dump_enabled_p ())
7391             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7392                              "multiple types in nested loop.\n");
7393           return false;
7394         }
7395
7396       /* FORNOW: outer loop induction with SLP not supported.  */
7397       if (STMT_SLP_TYPE (stmt_info))
7398         return false;
7399
7400       exit_phi = NULL;
7401       latch_e = loop_latch_edge (loop->inner);
7402       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7403       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7404         {
7405           gimple *use_stmt = USE_STMT (use_p);
7406           if (is_gimple_debug (use_stmt))
7407             continue;
7408
7409           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7410             {
7411               exit_phi = use_stmt;
7412               break;
7413             }
7414         }
7415       if (exit_phi)
7416         {
7417           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7418           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7419                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7420             {
7421               if (dump_enabled_p ())
7422                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7423                                  "inner-loop induction only used outside "
7424                                  "of the outer vectorized loop.\n");
7425               return false;
7426             }
7427         }
7428
7429       nested_in_vect_loop = true;
7430       iv_loop = loop->inner;
7431     }
7432   else
7433     iv_loop = loop;
7434   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7435
7436   if (slp_node && !nunits.is_constant ())
7437     {
7438       /* The current SLP code creates the initial value element-by-element.  */
7439       if (dump_enabled_p ())
7440         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7441                          "SLP induction not supported for variable-length"
7442                          " vectors.\n");
7443       return false;
7444     }
7445
7446   if (!vec_stmt) /* transformation not required.  */
7447     {
7448       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7449       if (dump_enabled_p ())
7450         dump_printf_loc (MSG_NOTE, vect_location,
7451                          "=== vectorizable_induction ===\n");
7452       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7453       return true;
7454     }
7455
7456   /* Transform.  */
7457
7458   /* Compute a vector variable, initialized with the first VF values of
7459      the induction variable.  E.g., for an iv with IV_PHI='X' and
7460      evolution S, for a vector of 4 units, we want to compute:
7461      [X, X + S, X + 2*S, X + 3*S].  */
7462
7463   if (dump_enabled_p ())
7464     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7465
7466   latch_e = loop_latch_edge (iv_loop);
7467   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7468
7469   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7470   gcc_assert (step_expr != NULL_TREE);
7471
7472   pe = loop_preheader_edge (iv_loop);
7473   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7474                                      loop_preheader_edge (iv_loop));
7475
7476   stmts = NULL;
7477   if (!nested_in_vect_loop)
7478     {
7479       /* Convert the initial value to the desired type.  */
7480       tree new_type = TREE_TYPE (vectype);
7481       init_expr = gimple_convert (&stmts, new_type, init_expr);
7482
7483       /* If we are using the loop mask to "peel" for alignment then we need
7484          to adjust the start value here.  */
7485       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7486       if (skip_niters != NULL_TREE)
7487         {
7488           if (FLOAT_TYPE_P (vectype))
7489             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7490                                         skip_niters);
7491           else
7492             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7493           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7494                                          skip_niters, step_expr);
7495           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7496                                     init_expr, skip_step);
7497         }
7498     }
7499
7500   /* Convert the step to the desired type.  */
7501   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7502
7503   if (stmts)
7504     {
7505       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7506       gcc_assert (!new_bb);
7507     }
7508
7509   /* Find the first insertion point in the BB.  */
7510   si = gsi_after_labels (bb);
7511
7512   /* For SLP induction we have to generate several IVs as for example
7513      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7514      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7515      [VF*S, VF*S, VF*S, VF*S] for all.  */
7516   if (slp_node)
7517     {
7518       /* Enforced above.  */
7519       unsigned int const_nunits = nunits.to_constant ();
7520
7521       /* Generate [VF*S, VF*S, ... ].  */
7522       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7523         {
7524           expr = build_int_cst (integer_type_node, vf);
7525           expr = fold_convert (TREE_TYPE (step_expr), expr);
7526         }
7527       else
7528         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7529       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7530                               expr, step_expr);
7531       if (! CONSTANT_CLASS_P (new_name))
7532         new_name = vect_init_vector (phi, new_name,
7533                                      TREE_TYPE (step_expr), NULL);
7534       new_vec = build_vector_from_val (vectype, new_name);
7535       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7536
7537       /* Now generate the IVs.  */
7538       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7539       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7540       unsigned elts = const_nunits * nvects;
7541       unsigned nivs = least_common_multiple (group_size,
7542                                              const_nunits) / const_nunits;
7543       gcc_assert (elts % group_size == 0);
7544       tree elt = init_expr;
7545       unsigned ivn;
7546       for (ivn = 0; ivn < nivs; ++ivn)
7547         {
7548           tree_vector_builder elts (vectype, const_nunits, 1);
7549           stmts = NULL;
7550           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7551             {
7552               if (ivn*const_nunits + eltn >= group_size
7553                   && (ivn * const_nunits + eltn) % group_size == 0)
7554                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7555                                     elt, step_expr);
7556               elts.quick_push (elt);
7557             }
7558           vec_init = gimple_build_vector (&stmts, &elts);
7559           if (stmts)
7560             {
7561               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7562               gcc_assert (!new_bb);
7563             }
7564
7565           /* Create the induction-phi that defines the induction-operand.  */
7566           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7567           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7568           set_vinfo_for_stmt (induction_phi,
7569                               new_stmt_vec_info (induction_phi, loop_vinfo));
7570           induc_def = PHI_RESULT (induction_phi);
7571
7572           /* Create the iv update inside the loop  */
7573           vec_def = make_ssa_name (vec_dest);
7574           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7575           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7576           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7577
7578           /* Set the arguments of the phi node:  */
7579           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7580           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7581                        UNKNOWN_LOCATION);
7582
7583           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7584         }
7585
7586       /* Re-use IVs when we can.  */
7587       if (ivn < nvects)
7588         {
7589           unsigned vfp
7590             = least_common_multiple (group_size, const_nunits) / group_size;
7591           /* Generate [VF'*S, VF'*S, ... ].  */
7592           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7593             {
7594               expr = build_int_cst (integer_type_node, vfp);
7595               expr = fold_convert (TREE_TYPE (step_expr), expr);
7596             }
7597           else
7598             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7599           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7600                                   expr, step_expr);
7601           if (! CONSTANT_CLASS_P (new_name))
7602             new_name = vect_init_vector (phi, new_name,
7603                                          TREE_TYPE (step_expr), NULL);
7604           new_vec = build_vector_from_val (vectype, new_name);
7605           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7606           for (; ivn < nvects; ++ivn)
7607             {
7608               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7609               tree def;
7610               if (gimple_code (iv) == GIMPLE_PHI)
7611                 def = gimple_phi_result (iv);
7612               else
7613                 def = gimple_assign_lhs (iv);
7614               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7615                                               PLUS_EXPR,
7616                                               def, vec_step);
7617               if (gimple_code (iv) == GIMPLE_PHI)
7618                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7619               else
7620                 {
7621                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7622                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7623                 }
7624               set_vinfo_for_stmt (new_stmt,
7625                                   new_stmt_vec_info (new_stmt, loop_vinfo));
7626               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7627             }
7628         }
7629
7630       return true;
7631     }
7632
7633   /* Create the vector that holds the initial_value of the induction.  */
7634   if (nested_in_vect_loop)
7635     {
7636       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7637          been created during vectorization of previous stmts.  We obtain it
7638          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7639       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7640       /* If the initial value is not of proper type, convert it.  */
7641       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7642         {
7643           new_stmt
7644             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7645                                                           vect_simple_var,
7646                                                           "vec_iv_"),
7647                                    VIEW_CONVERT_EXPR,
7648                                    build1 (VIEW_CONVERT_EXPR, vectype,
7649                                            vec_init));
7650           vec_init = gimple_assign_lhs (new_stmt);
7651           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7652                                                  new_stmt);
7653           gcc_assert (!new_bb);
7654           set_vinfo_for_stmt (new_stmt,
7655                               new_stmt_vec_info (new_stmt, loop_vinfo));
7656         }
7657     }
7658   else
7659     {
7660       /* iv_loop is the loop to be vectorized. Create:
7661          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7662       stmts = NULL;
7663       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7664
7665       unsigned HOST_WIDE_INT const_nunits;
7666       if (nunits.is_constant (&const_nunits))
7667         {
7668           tree_vector_builder elts (vectype, const_nunits, 1);
7669           elts.quick_push (new_name);
7670           for (i = 1; i < const_nunits; i++)
7671             {
7672               /* Create: new_name_i = new_name + step_expr  */
7673               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7674                                        new_name, step_expr);
7675               elts.quick_push (new_name);
7676             }
7677           /* Create a vector from [new_name_0, new_name_1, ...,
7678              new_name_nunits-1]  */
7679           vec_init = gimple_build_vector (&stmts, &elts);
7680         }
7681       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7682         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7683         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7684                                  new_name, step_expr);
7685       else
7686         {
7687           /* Build:
7688                 [base, base, base, ...]
7689                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7690           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7691           gcc_assert (flag_associative_math);
7692           tree index = build_index_vector (vectype, 0, 1);
7693           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7694                                                         new_name);
7695           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7696                                                         step_expr);
7697           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7698           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7699                                    vec_init, step_vec);
7700           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7701                                    vec_init, base_vec);
7702         }
7703
7704       if (stmts)
7705         {
7706           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7707           gcc_assert (!new_bb);
7708         }
7709     }
7710
7711
7712   /* Create the vector that holds the step of the induction.  */
7713   if (nested_in_vect_loop)
7714     /* iv_loop is nested in the loop to be vectorized. Generate:
7715        vec_step = [S, S, S, S]  */
7716     new_name = step_expr;
7717   else
7718     {
7719       /* iv_loop is the loop to be vectorized. Generate:
7720           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7721       gimple_seq seq = NULL;
7722       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7723         {
7724           expr = build_int_cst (integer_type_node, vf);
7725           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7726         }
7727       else
7728         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7729       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7730                                expr, step_expr);
7731       if (seq)
7732         {
7733           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7734           gcc_assert (!new_bb);
7735         }
7736     }
7737
7738   t = unshare_expr (new_name);
7739   gcc_assert (CONSTANT_CLASS_P (new_name)
7740               || TREE_CODE (new_name) == SSA_NAME);
7741   new_vec = build_vector_from_val (vectype, t);
7742   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7743
7744
7745   /* Create the following def-use cycle:
7746      loop prolog:
7747          vec_init = ...
7748          vec_step = ...
7749      loop:
7750          vec_iv = PHI <vec_init, vec_loop>
7751          ...
7752          STMT
7753          ...
7754          vec_loop = vec_iv + vec_step;  */
7755
7756   /* Create the induction-phi that defines the induction-operand.  */
7757   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7758   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7759   set_vinfo_for_stmt (induction_phi,
7760                       new_stmt_vec_info (induction_phi, loop_vinfo));
7761   induc_def = PHI_RESULT (induction_phi);
7762
7763   /* Create the iv update inside the loop  */
7764   vec_def = make_ssa_name (vec_dest);
7765   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7766   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7767   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7768
7769   /* Set the arguments of the phi node:  */
7770   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7771   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7772                UNKNOWN_LOCATION);
7773
7774   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7775
7776   /* In case that vectorization factor (VF) is bigger than the number
7777      of elements that we can fit in a vectype (nunits), we have to generate
7778      more than one vector stmt - i.e - we need to "unroll" the
7779      vector stmt by a factor VF/nunits.  For more details see documentation
7780      in vectorizable_operation.  */
7781
7782   if (ncopies > 1)
7783     {
7784       gimple_seq seq = NULL;
7785       stmt_vec_info prev_stmt_vinfo;
7786       /* FORNOW. This restriction should be relaxed.  */
7787       gcc_assert (!nested_in_vect_loop);
7788
7789       /* Create the vector that holds the step of the induction.  */
7790       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7791         {
7792           expr = build_int_cst (integer_type_node, nunits);
7793           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7794         }
7795       else
7796         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7797       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7798                                expr, step_expr);
7799       if (seq)
7800         {
7801           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7802           gcc_assert (!new_bb);
7803         }
7804
7805       t = unshare_expr (new_name);
7806       gcc_assert (CONSTANT_CLASS_P (new_name)
7807                   || TREE_CODE (new_name) == SSA_NAME);
7808       new_vec = build_vector_from_val (vectype, t);
7809       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7810
7811       vec_def = induc_def;
7812       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7813       for (i = 1; i < ncopies; i++)
7814         {
7815           /* vec_i = vec_prev + vec_step  */
7816           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7817                                           vec_def, vec_step);
7818           vec_def = make_ssa_name (vec_dest, new_stmt);
7819           gimple_assign_set_lhs (new_stmt, vec_def);
7820
7821           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7822           set_vinfo_for_stmt (new_stmt,
7823                               new_stmt_vec_info (new_stmt, loop_vinfo));
7824           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7825           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7826         }
7827     }
7828
7829   if (nested_in_vect_loop)
7830     {
7831       /* Find the loop-closed exit-phi of the induction, and record
7832          the final vector of induction results:  */
7833       exit_phi = NULL;
7834       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7835         {
7836           gimple *use_stmt = USE_STMT (use_p);
7837           if (is_gimple_debug (use_stmt))
7838             continue;
7839
7840           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7841             {
7842               exit_phi = use_stmt;
7843               break;
7844             }
7845         }
7846       if (exit_phi)
7847         {
7848           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7849           /* FORNOW. Currently not supporting the case that an inner-loop induction
7850              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7851           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7852                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7853
7854           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7855           if (dump_enabled_p ())
7856             {
7857               dump_printf_loc (MSG_NOTE, vect_location,
7858                                "vector of inductions after inner-loop:");
7859               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7860             }
7861         }
7862     }
7863
7864
7865   if (dump_enabled_p ())
7866     {
7867       dump_printf_loc (MSG_NOTE, vect_location,
7868                        "transform induction: created def-use cycle: ");
7869       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7870       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7871                         SSA_NAME_DEF_STMT (vec_def), 0);
7872     }
7873
7874   return true;
7875 }
7876
7877 /* Function vectorizable_live_operation.
7878
7879    STMT computes a value that is used outside the loop.  Check if
7880    it can be supported.  */
7881
7882 bool
7883 vectorizable_live_operation (gimple *stmt,
7884                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7885                              slp_tree slp_node, int slp_index,
7886                              gimple **vec_stmt,
7887                              stmt_vector_for_cost *)
7888 {
7889   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7890   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7891   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7892   imm_use_iterator imm_iter;
7893   tree lhs, lhs_type, bitsize, vec_bitsize;
7894   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7895   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7896   int ncopies;
7897   gimple *use_stmt;
7898   auto_vec<tree> vec_oprnds;
7899   int vec_entry = 0;
7900   poly_uint64 vec_index = 0;
7901
7902   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7903
7904   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7905     return false;
7906
7907   /* FORNOW.  CHECKME.  */
7908   if (nested_in_vect_loop_p (loop, stmt))
7909     return false;
7910
7911   /* If STMT is not relevant and it is a simple assignment and its inputs are
7912      invariant then it can remain in place, unvectorized.  The original last
7913      scalar value that it computes will be used.  */
7914   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7915     {
7916       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7917       if (dump_enabled_p ())
7918         dump_printf_loc (MSG_NOTE, vect_location,
7919                          "statement is simple and uses invariant.  Leaving in "
7920                          "place.\n");
7921       return true;
7922     }
7923
7924   if (slp_node)
7925     ncopies = 1;
7926   else
7927     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7928
7929   if (slp_node)
7930     {
7931       gcc_assert (slp_index >= 0);
7932
7933       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7934       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7935
7936       /* Get the last occurrence of the scalar index from the concatenation of
7937          all the slp vectors. Calculate which slp vector it is and the index
7938          within.  */
7939       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7940
7941       /* Calculate which vector contains the result, and which lane of
7942          that vector we need.  */
7943       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7944         {
7945           if (dump_enabled_p ())
7946             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7947                              "Cannot determine which vector holds the"
7948                              " final result.\n");
7949           return false;
7950         }
7951     }
7952
7953   if (!vec_stmt)
7954     {
7955       /* No transformation required.  */
7956       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7957         {
7958           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7959                                                OPTIMIZE_FOR_SPEED))
7960             {
7961               if (dump_enabled_p ())
7962                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7963                                  "can't use a fully-masked loop because "
7964                                  "the target doesn't support extract last "
7965                                  "reduction.\n");
7966               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7967             }
7968           else if (slp_node)
7969             {
7970               if (dump_enabled_p ())
7971                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7972                                  "can't use a fully-masked loop because an "
7973                                  "SLP statement is live after the loop.\n");
7974               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7975             }
7976           else if (ncopies > 1)
7977             {
7978               if (dump_enabled_p ())
7979                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7980                                  "can't use a fully-masked loop because"
7981                                  " ncopies is greater than 1.\n");
7982               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7983             }
7984           else
7985             {
7986               gcc_assert (ncopies == 1 && !slp_node);
7987               vect_record_loop_mask (loop_vinfo,
7988                                      &LOOP_VINFO_MASKS (loop_vinfo),
7989                                      1, vectype);
7990             }
7991         }
7992       return true;
7993     }
7994
7995   /* If stmt has a related stmt, then use that for getting the lhs.  */
7996   if (is_pattern_stmt_p (stmt_info))
7997     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7998
7999   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8000         : gimple_get_lhs (stmt);
8001   lhs_type = TREE_TYPE (lhs);
8002
8003   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8004              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8005              : TYPE_SIZE (TREE_TYPE (vectype)));
8006   vec_bitsize = TYPE_SIZE (vectype);
8007
8008   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8009   tree vec_lhs, bitstart;
8010   if (slp_node)
8011     {
8012       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8013
8014       /* Get the correct slp vectorized stmt.  */
8015       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8016       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8017         vec_lhs = gimple_phi_result (phi);
8018       else
8019         vec_lhs = gimple_get_lhs (vec_stmt);
8020
8021       /* Get entry to use.  */
8022       bitstart = bitsize_int (vec_index);
8023       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8024     }
8025   else
8026     {
8027       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8028       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8029       gcc_checking_assert (ncopies == 1
8030                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8031
8032       /* For multiple copies, get the last copy.  */
8033       for (int i = 1; i < ncopies; ++i)
8034         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8035                                                   vec_lhs);
8036
8037       /* Get the last lane in the vector.  */
8038       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8039     }
8040
8041   gimple_seq stmts = NULL;
8042   tree new_tree;
8043   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8044     {
8045       /* Emit:
8046
8047            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8048
8049          where VEC_LHS is the vectorized live-out result and MASK is
8050          the loop mask for the final iteration.  */
8051       gcc_assert (ncopies == 1 && !slp_node);
8052       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8053       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8054                                       1, vectype, 0);
8055       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8056                                       scalar_type, mask, vec_lhs);
8057
8058       /* Convert the extracted vector element to the required scalar type.  */
8059       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8060     }
8061   else
8062     {
8063       tree bftype = TREE_TYPE (vectype);
8064       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8065         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8066       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8067       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8068                                        &stmts, true, NULL_TREE);
8069     }
8070
8071   if (stmts)
8072     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8073
8074   /* Replace use of lhs with newly computed result.  If the use stmt is a
8075      single arg PHI, just replace all uses of PHI result.  It's necessary
8076      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8077   use_operand_p use_p;
8078   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8079     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8080         && !is_gimple_debug (use_stmt))
8081     {
8082       if (gimple_code (use_stmt) == GIMPLE_PHI
8083           && gimple_phi_num_args (use_stmt) == 1)
8084         {
8085           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8086         }
8087       else
8088         {
8089           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8090             SET_USE (use_p, new_tree);
8091         }
8092       update_stmt (use_stmt);
8093     }
8094
8095   return true;
8096 }
8097
8098 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8099
8100 static void
8101 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8102 {
8103   ssa_op_iter op_iter;
8104   imm_use_iterator imm_iter;
8105   def_operand_p def_p;
8106   gimple *ustmt;
8107
8108   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8109     {
8110       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8111         {
8112           basic_block bb;
8113
8114           if (!is_gimple_debug (ustmt))
8115             continue;
8116
8117           bb = gimple_bb (ustmt);
8118
8119           if (!flow_bb_inside_loop_p (loop, bb))
8120             {
8121               if (gimple_debug_bind_p (ustmt))
8122                 {
8123                   if (dump_enabled_p ())
8124                     dump_printf_loc (MSG_NOTE, vect_location,
8125                                      "killing debug use\n");
8126
8127                   gimple_debug_bind_reset_value (ustmt);
8128                   update_stmt (ustmt);
8129                 }
8130               else
8131                 gcc_unreachable ();
8132             }
8133         }
8134     }
8135 }
8136
8137 /* Given loop represented by LOOP_VINFO, return true if computation of
8138    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8139    otherwise.  */
8140
8141 static bool
8142 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8143 {
8144   /* Constant case.  */
8145   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8146     {
8147       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8148       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8149
8150       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8151       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8152       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8153         return true;
8154     }
8155
8156   widest_int max;
8157   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8158   /* Check the upper bound of loop niters.  */
8159   if (get_max_loop_iterations (loop, &max))
8160     {
8161       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8162       signop sgn = TYPE_SIGN (type);
8163       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8164       if (max < type_max)
8165         return true;
8166     }
8167   return false;
8168 }
8169
8170 /* Return a mask type with half the number of elements as TYPE.  */
8171
8172 tree
8173 vect_halve_mask_nunits (tree type)
8174 {
8175   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8176   return build_truth_vector_type (nunits, current_vector_size);
8177 }
8178
8179 /* Return a mask type with twice as many elements as TYPE.  */
8180
8181 tree
8182 vect_double_mask_nunits (tree type)
8183 {
8184   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8185   return build_truth_vector_type (nunits, current_vector_size);
8186 }
8187
8188 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8189    contain a sequence of NVECTORS masks that each control a vector of type
8190    VECTYPE.  */
8191
8192 void
8193 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8194                        unsigned int nvectors, tree vectype)
8195 {
8196   gcc_assert (nvectors != 0);
8197   if (masks->length () < nvectors)
8198     masks->safe_grow_cleared (nvectors);
8199   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8200   /* The number of scalars per iteration and the number of vectors are
8201      both compile-time constants.  */
8202   unsigned int nscalars_per_iter
8203     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8204                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8205   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8206     {
8207       rgm->max_nscalars_per_iter = nscalars_per_iter;
8208       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8209     }
8210 }
8211
8212 /* Given a complete set of masks MASKS, extract mask number INDEX
8213    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8214    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8215
8216    See the comment above vec_loop_masks for more details about the mask
8217    arrangement.  */
8218
8219 tree
8220 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8221                     unsigned int nvectors, tree vectype, unsigned int index)
8222 {
8223   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8224   tree mask_type = rgm->mask_type;
8225
8226   /* Populate the rgroup's mask array, if this is the first time we've
8227      used it.  */
8228   if (rgm->masks.is_empty ())
8229     {
8230       rgm->masks.safe_grow_cleared (nvectors);
8231       for (unsigned int i = 0; i < nvectors; ++i)
8232         {
8233           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8234           /* Provide a dummy definition until the real one is available.  */
8235           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8236           rgm->masks[i] = mask;
8237         }
8238     }
8239
8240   tree mask = rgm->masks[index];
8241   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8242                 TYPE_VECTOR_SUBPARTS (vectype)))
8243     {
8244       /* A loop mask for data type X can be reused for data type Y
8245          if X has N times more elements than Y and if Y's elements
8246          are N times bigger than X's.  In this case each sequence
8247          of N elements in the loop mask will be all-zero or all-one.
8248          We can then view-convert the mask so that each sequence of
8249          N elements is replaced by a single element.  */
8250       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8251                               TYPE_VECTOR_SUBPARTS (vectype)));
8252       gimple_seq seq = NULL;
8253       mask_type = build_same_sized_truth_vector_type (vectype);
8254       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8255       if (seq)
8256         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8257     }
8258   return mask;
8259 }
8260
8261 /* Scale profiling counters by estimation for LOOP which is vectorized
8262    by factor VF.  */
8263
8264 static void
8265 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8266 {
8267   edge preheader = loop_preheader_edge (loop);
8268   /* Reduce loop iterations by the vectorization factor.  */
8269   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8270   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8271
8272   if (freq_h.nonzero_p ())
8273     {
8274       profile_probability p;
8275
8276       /* Avoid dropping loop body profile counter to 0 because of zero count
8277          in loop's preheader.  */
8278       if (!(freq_e == profile_count::zero ()))
8279         freq_e = freq_e.force_nonzero ();
8280       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8281       scale_loop_frequencies (loop, p);
8282     }
8283
8284   edge exit_e = single_exit (loop);
8285   exit_e->probability = profile_probability::always ()
8286                                  .apply_scale (1, new_est_niter + 1);
8287
8288   edge exit_l = single_pred_edge (loop->latch);
8289   profile_probability prob = exit_l->probability;
8290   exit_l->probability = exit_e->probability.invert ();
8291   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8292     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8293 }
8294
8295 /* Function vect_transform_loop.
8296
8297    The analysis phase has determined that the loop is vectorizable.
8298    Vectorize the loop - created vectorized stmts to replace the scalar
8299    stmts in the loop, and update the loop exit condition.
8300    Returns scalar epilogue loop if any.  */
8301
8302 struct loop *
8303 vect_transform_loop (loop_vec_info loop_vinfo)
8304 {
8305   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8306   struct loop *epilogue = NULL;
8307   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8308   int nbbs = loop->num_nodes;
8309   int i;
8310   tree niters_vector = NULL_TREE;
8311   tree step_vector = NULL_TREE;
8312   tree niters_vector_mult_vf = NULL_TREE;
8313   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8314   unsigned int lowest_vf = constant_lower_bound (vf);
8315   bool grouped_store;
8316   bool slp_scheduled = false;
8317   gimple *stmt, *pattern_stmt;
8318   gimple_seq pattern_def_seq = NULL;
8319   gimple_stmt_iterator pattern_def_si = gsi_none ();
8320   bool transform_pattern_stmt = false;
8321   bool check_profitability = false;
8322   unsigned int th;
8323
8324   if (dump_enabled_p ())
8325     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8326
8327   /* Use the more conservative vectorization threshold.  If the number
8328      of iterations is constant assume the cost check has been performed
8329      by our caller.  If the threshold makes all loops profitable that
8330      run at least the (estimated) vectorization factor number of times
8331      checking is pointless, too.  */
8332   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8333   if (th >= vect_vf_for_cost (loop_vinfo)
8334       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8335     {
8336       if (dump_enabled_p ())
8337         dump_printf_loc (MSG_NOTE, vect_location,
8338                          "Profitability threshold is %d loop iterations.\n",
8339                          th);
8340       check_profitability = true;
8341     }
8342
8343   /* Make sure there exists a single-predecessor exit bb.  Do this before
8344      versioning.   */
8345   edge e = single_exit (loop);
8346   if (! single_pred_p (e->dest))
8347     {
8348       split_loop_exit_edge (e);
8349       if (dump_enabled_p ())
8350         dump_printf (MSG_NOTE, "split exit edge\n");
8351     }
8352
8353   /* Version the loop first, if required, so the profitability check
8354      comes first.  */
8355
8356   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8357     {
8358       poly_uint64 versioning_threshold
8359         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8360       if (check_profitability
8361           && ordered_p (poly_uint64 (th), versioning_threshold))
8362         {
8363           versioning_threshold = ordered_max (poly_uint64 (th),
8364                                               versioning_threshold);
8365           check_profitability = false;
8366         }
8367       vect_loop_versioning (loop_vinfo, th, check_profitability,
8368                             versioning_threshold);
8369       check_profitability = false;
8370     }
8371
8372   /* Make sure there exists a single-predecessor exit bb also on the
8373      scalar loop copy.  Do this after versioning but before peeling
8374      so CFG structure is fine for both scalar and if-converted loop
8375      to make slpeel_duplicate_current_defs_from_edges face matched
8376      loop closed PHI nodes on the exit.  */
8377   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8378     {
8379       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8380       if (! single_pred_p (e->dest))
8381         {
8382           split_loop_exit_edge (e);
8383           if (dump_enabled_p ())
8384             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8385         }
8386     }
8387
8388   tree niters = vect_build_loop_niters (loop_vinfo);
8389   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8390   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8391   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8392   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8393                               &step_vector, &niters_vector_mult_vf, th,
8394                               check_profitability, niters_no_overflow);
8395
8396   if (niters_vector == NULL_TREE)
8397     {
8398       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8399           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8400           && known_eq (lowest_vf, vf))
8401         {
8402           niters_vector
8403             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8404                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8405           step_vector = build_one_cst (TREE_TYPE (niters));
8406         }
8407       else
8408         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8409                                      &step_vector, niters_no_overflow);
8410     }
8411
8412   /* 1) Make sure the loop header has exactly two entries
8413      2) Make sure we have a preheader basic block.  */
8414
8415   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8416
8417   split_edge (loop_preheader_edge (loop));
8418
8419   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8420       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8421     /* This will deal with any possible peeling.  */
8422     vect_prepare_for_masked_peels (loop_vinfo);
8423
8424   /* FORNOW: the vectorizer supports only loops which body consist
8425      of one basic block (header + empty latch). When the vectorizer will
8426      support more involved loop forms, the order by which the BBs are
8427      traversed need to be reconsidered.  */
8428
8429   for (i = 0; i < nbbs; i++)
8430     {
8431       basic_block bb = bbs[i];
8432       stmt_vec_info stmt_info;
8433
8434       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8435            gsi_next (&si))
8436         {
8437           gphi *phi = si.phi ();
8438           if (dump_enabled_p ())
8439             {
8440               dump_printf_loc (MSG_NOTE, vect_location,
8441                                "------>vectorizing phi: ");
8442               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8443             }
8444           stmt_info = vinfo_for_stmt (phi);
8445           if (!stmt_info)
8446             continue;
8447
8448           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8449             vect_loop_kill_debug_uses (loop, phi);
8450
8451           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8452               && !STMT_VINFO_LIVE_P (stmt_info))
8453             continue;
8454
8455           if (STMT_VINFO_VECTYPE (stmt_info)
8456               && (maybe_ne
8457                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8458               && dump_enabled_p ())
8459             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8460
8461           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8462                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8463                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8464               && ! PURE_SLP_STMT (stmt_info))
8465             {
8466               if (dump_enabled_p ())
8467                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8468               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8469             }
8470         }
8471
8472       pattern_stmt = NULL;
8473       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8474            !gsi_end_p (si) || transform_pattern_stmt;)
8475         {
8476           bool is_store;
8477
8478           if (transform_pattern_stmt)
8479             stmt = pattern_stmt;
8480           else
8481             {
8482               stmt = gsi_stmt (si);
8483               /* During vectorization remove existing clobber stmts.  */
8484               if (gimple_clobber_p (stmt))
8485                 {
8486                   unlink_stmt_vdef (stmt);
8487                   gsi_remove (&si, true);
8488                   release_defs (stmt);
8489                   continue;
8490                 }
8491             }
8492
8493           if (dump_enabled_p ())
8494             {
8495               dump_printf_loc (MSG_NOTE, vect_location,
8496                                "------>vectorizing statement: ");
8497               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8498             }
8499
8500           stmt_info = vinfo_for_stmt (stmt);
8501
8502           /* vector stmts created in the outer-loop during vectorization of
8503              stmts in an inner-loop may not have a stmt_info, and do not
8504              need to be vectorized.  */
8505           if (!stmt_info)
8506             {
8507               gsi_next (&si);
8508               continue;
8509             }
8510
8511           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8512             vect_loop_kill_debug_uses (loop, stmt);
8513
8514           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8515               && !STMT_VINFO_LIVE_P (stmt_info))
8516             {
8517               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8518                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8519                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8520                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8521                 {
8522                   stmt = pattern_stmt;
8523                   stmt_info = vinfo_for_stmt (stmt);
8524                 }
8525               else
8526                 {
8527                   gsi_next (&si);
8528                   continue;
8529                 }
8530             }
8531           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8532                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8533                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8534                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8535             transform_pattern_stmt = true;
8536
8537           /* If pattern statement has def stmts, vectorize them too.  */
8538           if (is_pattern_stmt_p (stmt_info))
8539             {
8540               if (pattern_def_seq == NULL)
8541                 {
8542                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8543                   pattern_def_si = gsi_start (pattern_def_seq);
8544                 }
8545               else if (!gsi_end_p (pattern_def_si))
8546                 gsi_next (&pattern_def_si);
8547               if (pattern_def_seq != NULL)
8548                 {
8549                   gimple *pattern_def_stmt = NULL;
8550                   stmt_vec_info pattern_def_stmt_info = NULL;
8551
8552                   while (!gsi_end_p (pattern_def_si))
8553                     {
8554                       pattern_def_stmt = gsi_stmt (pattern_def_si);
8555                       pattern_def_stmt_info
8556                         = vinfo_for_stmt (pattern_def_stmt);
8557                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8558                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8559                         break;
8560                       gsi_next (&pattern_def_si);
8561                     }
8562
8563                   if (!gsi_end_p (pattern_def_si))
8564                     {
8565                       if (dump_enabled_p ())
8566                         {
8567                           dump_printf_loc (MSG_NOTE, vect_location,
8568                                            "==> vectorizing pattern def "
8569                                            "stmt: ");
8570                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8571                                             pattern_def_stmt, 0);
8572                         }
8573
8574                       stmt = pattern_def_stmt;
8575                       stmt_info = pattern_def_stmt_info;
8576                     }
8577                   else
8578                     {
8579                       pattern_def_si = gsi_none ();
8580                       transform_pattern_stmt = false;
8581                     }
8582                 }
8583               else
8584                 transform_pattern_stmt = false;
8585             }
8586
8587           if (STMT_VINFO_VECTYPE (stmt_info))
8588             {
8589               poly_uint64 nunits
8590                 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8591               if (!STMT_SLP_TYPE (stmt_info)
8592                   && maybe_ne (nunits, vf)
8593                   && dump_enabled_p ())
8594                   /* For SLP VF is set according to unrolling factor, and not
8595                      to vector size, hence for SLP this print is not valid.  */
8596                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8597             }
8598
8599           /* SLP. Schedule all the SLP instances when the first SLP stmt is
8600              reached.  */
8601           if (STMT_SLP_TYPE (stmt_info))
8602             {
8603               if (!slp_scheduled)
8604                 {
8605                   slp_scheduled = true;
8606
8607                   if (dump_enabled_p ())
8608                     dump_printf_loc (MSG_NOTE, vect_location,
8609                                      "=== scheduling SLP instances ===\n");
8610
8611                   vect_schedule_slp (loop_vinfo);
8612                 }
8613
8614               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8615               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8616                 {
8617                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8618                     {
8619                       pattern_def_seq = NULL;
8620                       gsi_next (&si);
8621                     }
8622                   continue;
8623                 }
8624             }
8625
8626           /* -------- vectorize statement ------------ */
8627           if (dump_enabled_p ())
8628             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8629
8630           grouped_store = false;
8631           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8632           if (is_store)
8633             {
8634               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8635                 {
8636                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8637                      interleaving chain was completed - free all the stores in
8638                      the chain.  */
8639                   gsi_next (&si);
8640                   vect_remove_stores (DR_GROUP_FIRST_ELEMENT (stmt_info));
8641                 }
8642               else
8643                 {
8644                   /* Free the attached stmt_vec_info and remove the stmt.  */
8645                   gimple *store = gsi_stmt (si);
8646                   free_stmt_vec_info (store);
8647                   unlink_stmt_vdef (store);
8648                   gsi_remove (&si, true);
8649                   release_defs (store);
8650                 }
8651
8652               /* Stores can only appear at the end of pattern statements.  */
8653               gcc_assert (!transform_pattern_stmt);
8654               pattern_def_seq = NULL;
8655             }
8656           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8657             {
8658               pattern_def_seq = NULL;
8659               gsi_next (&si);
8660             }
8661         }                       /* stmts in BB */
8662
8663       /* Stub out scalar statements that must not survive vectorization.
8664          Doing this here helps with grouped statements, or statements that
8665          are involved in patterns.  */
8666       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8667            !gsi_end_p (gsi); gsi_next (&gsi))
8668         {
8669           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8670           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8671             {
8672               tree lhs = gimple_get_lhs (call);
8673               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8674                 {
8675                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8676                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8677                   gsi_replace (&gsi, new_stmt, true);
8678                 }
8679             }
8680         }
8681     }                           /* BBs in loop */
8682
8683   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8684      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8685   if (integer_onep (step_vector))
8686     niters_no_overflow = true;
8687   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8688                            niters_vector_mult_vf, !niters_no_overflow);
8689
8690   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8691   scale_profile_for_vect_loop (loop, assumed_vf);
8692
8693   /* True if the final iteration might not handle a full vector's
8694      worth of scalar iterations.  */
8695   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8696   /* The minimum number of iterations performed by the epilogue.  This
8697      is 1 when peeling for gaps because we always need a final scalar
8698      iteration.  */
8699   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8700   /* +1 to convert latch counts to loop iteration counts,
8701      -min_epilogue_iters to remove iterations that cannot be performed
8702        by the vector code.  */
8703   int bias_for_lowest = 1 - min_epilogue_iters;
8704   int bias_for_assumed = bias_for_lowest;
8705   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8706   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8707     {
8708       /* When the amount of peeling is known at compile time, the first
8709          iteration will have exactly alignment_npeels active elements.
8710          In the worst case it will have at least one.  */
8711       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8712       bias_for_lowest += lowest_vf - min_first_active;
8713       bias_for_assumed += assumed_vf - min_first_active;
8714     }
8715   /* In these calculations the "- 1" converts loop iteration counts
8716      back to latch counts.  */
8717   if (loop->any_upper_bound)
8718     loop->nb_iterations_upper_bound
8719       = (final_iter_may_be_partial
8720          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8721                           lowest_vf) - 1
8722          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8723                            lowest_vf) - 1);
8724   if (loop->any_likely_upper_bound)
8725     loop->nb_iterations_likely_upper_bound
8726       = (final_iter_may_be_partial
8727          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8728                           + bias_for_lowest, lowest_vf) - 1
8729          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8730                            + bias_for_lowest, lowest_vf) - 1);
8731   if (loop->any_estimate)
8732     loop->nb_iterations_estimate
8733       = (final_iter_may_be_partial
8734          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8735                           assumed_vf) - 1
8736          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8737                            assumed_vf) - 1);
8738
8739   if (dump_enabled_p ())
8740     {
8741       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8742         {
8743           dump_printf_loc (MSG_NOTE, vect_location,
8744                            "LOOP VECTORIZED\n");
8745           if (loop->inner)
8746             dump_printf_loc (MSG_NOTE, vect_location,
8747                              "OUTER LOOP VECTORIZED\n");
8748           dump_printf (MSG_NOTE, "\n");
8749         }
8750       else
8751         {
8752           dump_printf_loc (MSG_NOTE, vect_location,
8753                            "LOOP EPILOGUE VECTORIZED (VS=");
8754           dump_dec (MSG_NOTE, current_vector_size);
8755           dump_printf (MSG_NOTE, ")\n");
8756         }
8757     }
8758
8759   /* Free SLP instances here because otherwise stmt reference counting
8760      won't work.  */
8761   slp_instance instance;
8762   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8763     vect_free_slp_instance (instance);
8764   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8765   /* Clear-up safelen field since its value is invalid after vectorization
8766      since vectorized loop can have loop-carried dependencies.  */
8767   loop->safelen = 0;
8768
8769   /* Don't vectorize epilogue for epilogue.  */
8770   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8771     epilogue = NULL;
8772
8773   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8774     epilogue = NULL;
8775
8776   if (epilogue)
8777     {
8778       auto_vector_sizes vector_sizes;
8779       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8780       unsigned int next_size = 0;
8781
8782       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8783           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8784           && known_eq (vf, lowest_vf))
8785         {
8786           unsigned int eiters
8787             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8788                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8789           eiters = eiters % lowest_vf;
8790           epilogue->nb_iterations_upper_bound = eiters - 1;
8791
8792           unsigned int ratio;
8793           while (next_size < vector_sizes.length ()
8794                  && !(constant_multiple_p (current_vector_size,
8795                                            vector_sizes[next_size], &ratio)
8796                       && eiters >= lowest_vf / ratio))
8797             next_size += 1;
8798         }
8799       else
8800         while (next_size < vector_sizes.length ()
8801                && maybe_lt (current_vector_size, vector_sizes[next_size]))
8802           next_size += 1;
8803
8804       if (next_size == vector_sizes.length ())
8805         epilogue = NULL;
8806     }
8807
8808   if (epilogue)
8809     {
8810       epilogue->force_vectorize = loop->force_vectorize;
8811       epilogue->safelen = loop->safelen;
8812       epilogue->dont_vectorize = false;
8813
8814       /* We may need to if-convert epilogue to vectorize it.  */
8815       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8816         tree_if_conversion (epilogue);
8817     }
8818
8819   return epilogue;
8820 }
8821
8822 /* The code below is trying to perform simple optimization - revert
8823    if-conversion for masked stores, i.e. if the mask of a store is zero
8824    do not perform it and all stored value producers also if possible.
8825    For example,
8826      for (i=0; i<n; i++)
8827        if (c[i])
8828         {
8829           p1[i] += 1;
8830           p2[i] = p3[i] +2;
8831         }
8832    this transformation will produce the following semi-hammock:
8833
8834    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8835      {
8836        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8837        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8838        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8839        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8840        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8841        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8842      }
8843 */
8844
8845 void
8846 optimize_mask_stores (struct loop *loop)
8847 {
8848   basic_block *bbs = get_loop_body (loop);
8849   unsigned nbbs = loop->num_nodes;
8850   unsigned i;
8851   basic_block bb;
8852   struct loop *bb_loop;
8853   gimple_stmt_iterator gsi;
8854   gimple *stmt;
8855   auto_vec<gimple *> worklist;
8856
8857   vect_location = find_loop_location (loop);
8858   /* Pick up all masked stores in loop if any.  */
8859   for (i = 0; i < nbbs; i++)
8860     {
8861       bb = bbs[i];
8862       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8863            gsi_next (&gsi))
8864         {
8865           stmt = gsi_stmt (gsi);
8866           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8867             worklist.safe_push (stmt);
8868         }
8869     }
8870
8871   free (bbs);
8872   if (worklist.is_empty ())
8873     return;
8874
8875   /* Loop has masked stores.  */
8876   while (!worklist.is_empty ())
8877     {
8878       gimple *last, *last_store;
8879       edge e, efalse;
8880       tree mask;
8881       basic_block store_bb, join_bb;
8882       gimple_stmt_iterator gsi_to;
8883       tree vdef, new_vdef;
8884       gphi *phi;
8885       tree vectype;
8886       tree zero;
8887
8888       last = worklist.pop ();
8889       mask = gimple_call_arg (last, 2);
8890       bb = gimple_bb (last);
8891       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8892          the same loop as if_bb.  It could be different to LOOP when two
8893          level loop-nest is vectorized and mask_store belongs to the inner
8894          one.  */
8895       e = split_block (bb, last);
8896       bb_loop = bb->loop_father;
8897       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8898       join_bb = e->dest;
8899       store_bb = create_empty_bb (bb);
8900       add_bb_to_loop (store_bb, bb_loop);
8901       e->flags = EDGE_TRUE_VALUE;
8902       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8903       /* Put STORE_BB to likely part.  */
8904       efalse->probability = profile_probability::unlikely ();
8905       store_bb->count = efalse->count ();
8906       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8907       if (dom_info_available_p (CDI_DOMINATORS))
8908         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8909       if (dump_enabled_p ())
8910         dump_printf_loc (MSG_NOTE, vect_location,
8911                          "Create new block %d to sink mask stores.",
8912                          store_bb->index);
8913       /* Create vector comparison with boolean result.  */
8914       vectype = TREE_TYPE (mask);
8915       zero = build_zero_cst (vectype);
8916       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8917       gsi = gsi_last_bb (bb);
8918       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8919       /* Create new PHI node for vdef of the last masked store:
8920          .MEM_2 = VDEF <.MEM_1>
8921          will be converted to
8922          .MEM.3 = VDEF <.MEM_1>
8923          and new PHI node will be created in join bb
8924          .MEM_2 = PHI <.MEM_1, .MEM_3>
8925       */
8926       vdef = gimple_vdef (last);
8927       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8928       gimple_set_vdef (last, new_vdef);
8929       phi = create_phi_node (vdef, join_bb);
8930       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8931
8932       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8933       while (true)
8934         {
8935           gimple_stmt_iterator gsi_from;
8936           gimple *stmt1 = NULL;
8937
8938           /* Move masked store to STORE_BB.  */
8939           last_store = last;
8940           gsi = gsi_for_stmt (last);
8941           gsi_from = gsi;
8942           /* Shift GSI to the previous stmt for further traversal.  */
8943           gsi_prev (&gsi);
8944           gsi_to = gsi_start_bb (store_bb);
8945           gsi_move_before (&gsi_from, &gsi_to);
8946           /* Setup GSI_TO to the non-empty block start.  */
8947           gsi_to = gsi_start_bb (store_bb);
8948           if (dump_enabled_p ())
8949             {
8950               dump_printf_loc (MSG_NOTE, vect_location,
8951                                "Move stmt to created bb\n");
8952               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8953             }
8954           /* Move all stored value producers if possible.  */
8955           while (!gsi_end_p (gsi))
8956             {
8957               tree lhs;
8958               imm_use_iterator imm_iter;
8959               use_operand_p use_p;
8960               bool res;
8961
8962               /* Skip debug statements.  */
8963               if (is_gimple_debug (gsi_stmt (gsi)))
8964                 {
8965                   gsi_prev (&gsi);
8966                   continue;
8967                 }
8968               stmt1 = gsi_stmt (gsi);
8969               /* Do not consider statements writing to memory or having
8970                  volatile operand.  */
8971               if (gimple_vdef (stmt1)
8972                   || gimple_has_volatile_ops (stmt1))
8973                 break;
8974               gsi_from = gsi;
8975               gsi_prev (&gsi);
8976               lhs = gimple_get_lhs (stmt1);
8977               if (!lhs)
8978                 break;
8979
8980               /* LHS of vectorized stmt must be SSA_NAME.  */
8981               if (TREE_CODE (lhs) != SSA_NAME)
8982                 break;
8983
8984               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8985                 {
8986                   /* Remove dead scalar statement.  */
8987                   if (has_zero_uses (lhs))
8988                     {
8989                       gsi_remove (&gsi_from, true);
8990                       continue;
8991                     }
8992                 }
8993
8994               /* Check that LHS does not have uses outside of STORE_BB.  */
8995               res = true;
8996               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8997                 {
8998                   gimple *use_stmt;
8999                   use_stmt = USE_STMT (use_p);
9000                   if (is_gimple_debug (use_stmt))
9001                     continue;
9002                   if (gimple_bb (use_stmt) != store_bb)
9003                     {
9004                       res = false;
9005                       break;
9006                     }
9007                 }
9008               if (!res)
9009                 break;
9010
9011               if (gimple_vuse (stmt1)
9012                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
9013                 break;
9014
9015               /* Can move STMT1 to STORE_BB.  */
9016               if (dump_enabled_p ())
9017                 {
9018                   dump_printf_loc (MSG_NOTE, vect_location,
9019                                    "Move stmt to created bb\n");
9020                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9021                 }
9022               gsi_move_before (&gsi_from, &gsi_to);
9023               /* Shift GSI_TO for further insertion.  */
9024               gsi_prev (&gsi_to);
9025             }
9026           /* Put other masked stores with the same mask to STORE_BB.  */
9027           if (worklist.is_empty ()
9028               || gimple_call_arg (worklist.last (), 2) != mask
9029               || worklist.last () != stmt1)
9030             break;
9031           last = worklist.pop ();
9032         }
9033       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9034     }
9035 }