Avoid ICE for nested inductions (PR 83914)
[official-gcc.git] / gcc / tree-vect-loop.c
blob 8b2ecf84e3f652e2a31cbd5bcbb0c34d59f0e6b8
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
69 as if it were manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different vector sizes will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
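   As an illustrative sketch (not taken verbatim from this file's logic), the
   kind of target-support query described above boils down to

     machine_mode vmode = TYPE_MODE (vectype);
     bool supported = optab_handler (add_optab, vmode) != CODE_FOR_nothing;

   where "vectype" stands for whatever vector type was chosen for the stmt;
   a result of CODE_FOR_nothing means the target has no instruction for that
   operation/mode pair, so the stmt cannot be vectorized.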
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 /* Function vect_determine_vectorization_factor
159 Determine the vectorization factor (VF). VF is the number of data elements
160 that are operated upon in parallel in a single iteration of the vectorized
161 loop. For example, when vectorizing a loop that operates on 4-byte elements,
162 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
163 elements can fit in a single vector register.
165 We currently support vectorization of loops in which all types operated upon
166 are of the same size. Therefore this function currently sets VF according to
167 the size of the types operated upon, and fails if there are multiple sizes
168 in the loop.
170 VF is also the factor by which the loop iterations are strip-mined, e.g.:
171 original loop:
172 for (i=0; i<N; i++){
173 a[i] = b[i] + c[i];
176 vectorized loop:
177 for (i=0; i<N; i+=VF){
178 a[i:VF] = b[i:VF] + c[i:VF];
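   A worked example of the above (illustrative numbers only): with a 16-byte
   vector size, a loop whose smallest scalar type is a 4-byte int gets
   VF = 16/4 = 4, so each vector iteration handles a[i:4]; if the smallest
   scalar type in the loop were a 2-byte short, the VF would be 16/2 = 8.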
182 static bool
183 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
185 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
187 unsigned nbbs = loop->num_nodes;
188 poly_uint64 vectorization_factor = 1;
189 tree scalar_type = NULL_TREE;
190 gphi *phi;
191 tree vectype;
192 stmt_vec_info stmt_info;
193 unsigned i;
194 HOST_WIDE_INT dummy;
195 gimple *stmt, *pattern_stmt = NULL;
196 gimple_seq pattern_def_seq = NULL;
197 gimple_stmt_iterator pattern_def_si = gsi_none ();
198 bool analyze_pattern_stmt = false;
199 bool bool_result;
200 auto_vec<stmt_vec_info> mask_producers;
202 if (dump_enabled_p ())
203 dump_printf_loc (MSG_NOTE, vect_location,
204 "=== vect_determine_vectorization_factor ===\n");
206 for (i = 0; i < nbbs; i++)
208 basic_block bb = bbs[i];
210 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
211 gsi_next (&si))
213 phi = si.phi ();
214 stmt_info = vinfo_for_stmt (phi);
215 if (dump_enabled_p ())
217 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
218 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
221 gcc_assert (stmt_info);
223 if (STMT_VINFO_RELEVANT_P (stmt_info)
224 || STMT_VINFO_LIVE_P (stmt_info))
226 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
227 scalar_type = TREE_TYPE (PHI_RESULT (phi));
229 if (dump_enabled_p ())
231 dump_printf_loc (MSG_NOTE, vect_location,
232 "get vectype for scalar type: ");
233 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
234 dump_printf (MSG_NOTE, "\n");
237 vectype = get_vectype_for_scalar_type (scalar_type);
238 if (!vectype)
240 if (dump_enabled_p ())
242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
243 "not vectorized: unsupported "
244 "data-type ");
245 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
246 scalar_type);
247 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
249 return false;
251 STMT_VINFO_VECTYPE (stmt_info) = vectype;
253 if (dump_enabled_p ())
255 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
256 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
257 dump_printf (MSG_NOTE, "\n");
260 if (dump_enabled_p ())
262 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
263 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
264 dump_printf (MSG_NOTE, "\n");
267 vect_update_max_nunits (&vectorization_factor, vectype);
271 for (gimple_stmt_iterator si = gsi_start_bb (bb);
272 !gsi_end_p (si) || analyze_pattern_stmt;)
274 tree vf_vectype;
276 if (analyze_pattern_stmt)
277 stmt = pattern_stmt;
278 else
279 stmt = gsi_stmt (si);
281 stmt_info = vinfo_for_stmt (stmt);
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
290 gcc_assert (stmt_info);
292 /* Skip stmts which do not need to be vectorized. */
293 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
294 && !STMT_VINFO_LIVE_P (stmt_info))
295 || gimple_clobber_p (stmt))
297 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
298 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
299 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
300 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302 stmt = pattern_stmt;
303 stmt_info = vinfo_for_stmt (pattern_stmt);
304 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE, vect_location,
307 "==> examining pattern statement: ");
308 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
311 else
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
315 gsi_next (&si);
316 continue;
319 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
320 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
321 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
322 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
323 analyze_pattern_stmt = true;
325 /* If a pattern statement has def stmts, analyze them too. */
326 if (is_pattern_stmt_p (stmt_info))
328 if (pattern_def_seq == NULL)
330 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
331 pattern_def_si = gsi_start (pattern_def_seq);
333 else if (!gsi_end_p (pattern_def_si))
334 gsi_next (&pattern_def_si);
335 if (pattern_def_seq != NULL)
337 gimple *pattern_def_stmt = NULL;
338 stmt_vec_info pattern_def_stmt_info = NULL;
340 while (!gsi_end_p (pattern_def_si))
342 pattern_def_stmt = gsi_stmt (pattern_def_si);
343 pattern_def_stmt_info
344 = vinfo_for_stmt (pattern_def_stmt);
345 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
346 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
347 break;
348 gsi_next (&pattern_def_si);
351 if (!gsi_end_p (pattern_def_si))
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "==> examining pattern def stmt: ");
357 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
358 pattern_def_stmt, 0);
361 stmt = pattern_def_stmt;
362 stmt_info = pattern_def_stmt_info;
364 else
366 pattern_def_si = gsi_none ();
367 analyze_pattern_stmt = false;
370 else
371 analyze_pattern_stmt = false;
374 if (gimple_get_lhs (stmt) == NULL_TREE
375 /* MASK_STORE has no lhs, but is ok. */
376 && (!is_gimple_call (stmt)
377 || !gimple_call_internal_p (stmt)
378 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
380 if (is_gimple_call (stmt))
382 /* Ignore calls with no lhs. These must be calls to
383 #pragma omp simd functions, and what vectorization factor
384 it really needs can't be determined until
385 vectorizable_simd_clone_call. */
386 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
388 pattern_def_seq = NULL;
389 gsi_next (&si);
391 continue;
393 if (dump_enabled_p ())
395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
396 "not vectorized: irregular stmt.");
397 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
400 return false;
403 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
405 if (dump_enabled_p ())
407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
408 "not vectorized: vector stmt in loop:");
409 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
411 return false;
414 bool_result = false;
416 if (STMT_VINFO_VECTYPE (stmt_info))
418 /* The only case when a vectype has already been set is for stmts
419 that contain a dataref, or for "pattern-stmts" (stmts
420 generated by the vectorizer to represent/replace a certain
421 idiom). */
422 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
423 || is_pattern_stmt_p (stmt_info)
424 || !gsi_end_p (pattern_def_si));
425 vectype = STMT_VINFO_VECTYPE (stmt_info);
427 else
429 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
430 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
431 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
432 else
433 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
435 /* Bool ops don't participate in vectorization factor
436 computation. For comparisons, use the compared types to
437 compute a factor. */
438 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
439 && is_gimple_assign (stmt)
440 && gimple_assign_rhs_code (stmt) != COND_EXPR)
442 if (STMT_VINFO_RELEVANT_P (stmt_info)
443 || STMT_VINFO_LIVE_P (stmt_info))
444 mask_producers.safe_push (stmt_info);
445 bool_result = true;
447 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
448 == tcc_comparison
449 && !VECT_SCALAR_BOOLEAN_TYPE_P
450 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
451 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
452 else
454 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
456 pattern_def_seq = NULL;
457 gsi_next (&si);
459 continue;
463 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location,
466 "get vectype for scalar type: ");
467 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
468 dump_printf (MSG_NOTE, "\n");
470 vectype = get_vectype_for_scalar_type (scalar_type);
471 if (!vectype)
473 if (dump_enabled_p ())
475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
476 "not vectorized: unsupported "
477 "data-type ");
478 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
479 scalar_type);
480 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
482 return false;
485 if (!bool_result)
486 STMT_VINFO_VECTYPE (stmt_info) = vectype;
488 if (dump_enabled_p ())
490 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
491 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
492 dump_printf (MSG_NOTE, "\n");
496 /* Don't try to compute the VF from scalar types if the stmt
497 produces a boolean vector. Use the result vectype instead. */
498 if (VECTOR_BOOLEAN_TYPE_P (vectype))
499 vf_vectype = vectype;
500 else
502 /* The vectorization factor is according to the smallest
503 scalar type (or the largest vector size, but we only
504 support one vector size per loop). */
505 if (!bool_result)
506 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 &dummy);
508 if (dump_enabled_p ())
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513 dump_printf (MSG_NOTE, "\n");
515 vf_vectype = get_vectype_for_scalar_type (scalar_type);
517 if (!vf_vectype)
519 if (dump_enabled_p ())
521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
522 "not vectorized: unsupported data-type ");
523 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
524 scalar_type);
525 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
527 return false;
530 if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
531 GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
533 if (dump_enabled_p ())
535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
536 "not vectorized: different sized vector "
537 "types in statement, ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
539 vectype);
540 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
541 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
542 vf_vectype);
543 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
545 return false;
548 if (dump_enabled_p ())
550 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
551 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
552 dump_printf (MSG_NOTE, "\n");
555 if (dump_enabled_p ())
557 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
558 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
559 dump_printf (MSG_NOTE, "\n");
562 vect_update_max_nunits (&vectorization_factor, vf_vectype);
564 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
566 pattern_def_seq = NULL;
567 gsi_next (&si);
572 /* TODO: Analyze cost. Decide if worth while to vectorize. */
573 if (dump_enabled_p ())
575 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
576 dump_dec (MSG_NOTE, vectorization_factor);
577 dump_printf (MSG_NOTE, "\n");
580 if (known_le (vectorization_factor, 1U))
582 if (dump_enabled_p ())
583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
584 "not vectorized: unsupported data-type\n");
585 return false;
587 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
589 for (i = 0; i < mask_producers.length (); i++)
591 tree mask_type = NULL;
593 stmt = STMT_VINFO_STMT (mask_producers[i]);
595 if (is_gimple_assign (stmt)
596 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
597 && !VECT_SCALAR_BOOLEAN_TYPE_P
598 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
600 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
601 mask_type = get_mask_type_for_scalar_type (scalar_type);
603 if (!mask_type)
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "not vectorized: unsupported mask\n");
608 return false;
611 else
613 tree rhs;
614 ssa_op_iter iter;
615 gimple *def_stmt;
616 enum vect_def_type dt;
618 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
620 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
621 &def_stmt, &dt, &vectype))
623 if (dump_enabled_p ())
625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
626 "not vectorized: can't compute mask type "
627 "for statement, ");
628 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
631 return false;
634 /* No vectype probably means external definition.
635 Allow it in case there is another operand which
636 allows us to determine the mask type. */
637 if (!vectype)
638 continue;
640 if (!mask_type)
641 mask_type = vectype;
642 else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
643 TYPE_VECTOR_SUBPARTS (vectype)))
645 if (dump_enabled_p ())
647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
648 "not vectorized: different sized masks "
649 "types in statement, ");
650 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
651 mask_type);
652 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
653 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
654 vectype);
655 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
657 return false;
659 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
660 != VECTOR_BOOLEAN_TYPE_P (vectype))
662 if (dump_enabled_p ())
664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
665 "not vectorized: mixed mask and "
666 "nonmask vector types in statement, ");
667 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
668 mask_type);
669 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
670 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
671 vectype);
672 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
674 return false;
678 /* We may compare a boolean value loaded as a vector of integers.
679 Fix mask_type in that case. */
680 if (mask_type
681 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
682 && gimple_code (stmt) == GIMPLE_ASSIGN
683 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
684 mask_type = build_same_sized_truth_vector_type (mask_type);
687 /* A missing mask_type should mean a loop-invariant predicate.
688 This is probably a subject for optimization in
689 if-conversion. */
690 if (!mask_type)
692 if (dump_enabled_p ())
694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
695 "not vectorized: can't compute mask type "
696 "for statement, ");
697 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
700 return false;
703 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
706 return true;
710 /* Function vect_is_simple_iv_evolution.
712 FORNOW: A simple evolution of an induction variable in the loop is
713 considered a polynomial evolution. */
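/* For instance (illustrative example, not derived from a particular caller):
   for

     for (i = 0; i < n; i++)
       p = p + 4;

   the scev access function of P is the chrec {p_0, +, 4}_1, whose evolution
   part is the INTEGER_CST 4 and whose initial condition is p_0, so the
   evolution is "simple" in the sense used here.  */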
715 static bool
716 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
717 tree * step)
719 tree init_expr;
720 tree step_expr;
721 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
722 basic_block bb;
724 /* When there is no evolution in this loop, the evolution function
725 is not "simple". */
726 if (evolution_part == NULL_TREE)
727 return false;
729 /* When the evolution is a polynomial of degree >= 2
730 the evolution function is not "simple". */
731 if (tree_is_chrec (evolution_part))
732 return false;
734 step_expr = evolution_part;
735 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
737 if (dump_enabled_p ())
739 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
740 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
741 dump_printf (MSG_NOTE, ", init: ");
742 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
743 dump_printf (MSG_NOTE, "\n");
746 *init = init_expr;
747 *step = step_expr;
749 if (TREE_CODE (step_expr) != INTEGER_CST
750 && (TREE_CODE (step_expr) != SSA_NAME
751 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
752 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
753 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
754 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
755 || !flag_associative_math)))
756 && (TREE_CODE (step_expr) != REAL_CST
757 || !flag_associative_math))
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
761 "step unknown.\n");
762 return false;
765 return true;
768 /* Function vect_analyze_scalar_cycles_1.
770 Examine the cross iteration def-use cycles of scalar variables
771 in LOOP. LOOP_VINFO represents the loop that is now being
772 considered for vectorization (can be LOOP, or an outer-loop
773 enclosing LOOP). */
775 static void
776 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
778 basic_block bb = loop->header;
779 tree init, step;
780 auto_vec<gimple *, 64> worklist;
781 gphi_iterator gsi;
782 bool double_reduc;
784 if (dump_enabled_p ())
785 dump_printf_loc (MSG_NOTE, vect_location,
786 "=== vect_analyze_scalar_cycles ===\n");
788 /* First - identify all inductions. Reduction detection assumes that all the
789 inductions have been identified, therefore, this order must not be
790 changed. */
791 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
793 gphi *phi = gsi.phi ();
794 tree access_fn = NULL;
795 tree def = PHI_RESULT (phi);
796 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
798 if (dump_enabled_p ())
800 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
801 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
804 /* Skip virtual phi's. The data dependences that are associated with
805 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
806 if (virtual_operand_p (def))
807 continue;
809 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
811 /* Analyze the evolution function. */
812 access_fn = analyze_scalar_evolution (loop, def);
813 if (access_fn)
815 STRIP_NOPS (access_fn);
816 if (dump_enabled_p ())
818 dump_printf_loc (MSG_NOTE, vect_location,
819 "Access function of PHI: ");
820 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
821 dump_printf (MSG_NOTE, "\n");
823 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
824 = initial_condition_in_loop_num (access_fn, loop->num);
825 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
826 = evolution_part_in_loop_num (access_fn, loop->num);
829 if (!access_fn
830 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
831 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
832 && TREE_CODE (step) != INTEGER_CST))
834 worklist.safe_push (phi);
835 continue;
838 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
839 != NULL_TREE);
840 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
842 if (dump_enabled_p ())
843 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
844 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
848 /* Second - identify all reductions and nested cycles. */
849 while (worklist.length () > 0)
851 gimple *phi = worklist.pop ();
852 tree def = PHI_RESULT (phi);
853 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
854 gimple *reduc_stmt;
856 if (dump_enabled_p ())
858 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
859 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
862 gcc_assert (!virtual_operand_p (def)
863 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
865 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
866 &double_reduc, false);
867 if (reduc_stmt)
869 if (double_reduc)
871 if (dump_enabled_p ())
872 dump_printf_loc (MSG_NOTE, vect_location,
873 "Detected double reduction.\n");
875 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
876 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
877 vect_double_reduction_def;
879 else
881 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
883 if (dump_enabled_p ())
884 dump_printf_loc (MSG_NOTE, vect_location,
885 "Detected vectorizable nested cycle.\n");
887 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
888 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
889 vect_nested_cycle;
891 else
893 if (dump_enabled_p ())
894 dump_printf_loc (MSG_NOTE, vect_location,
895 "Detected reduction.\n");
897 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
898 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
899 vect_reduction_def;
900 /* Store the reduction cycles for possible vectorization in
901 loop-aware SLP if it was not detected as reduction
902 chain. */
903 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
904 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
908 else
909 if (dump_enabled_p ())
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
911 "Unknown def-use cycle pattern.\n");
916 /* Function vect_analyze_scalar_cycles.
918 Examine the cross iteration def-use cycles of scalar variables, by
919 analyzing the loop-header PHIs of scalar variables. Classify each
920 cycle as one of the following: invariant, induction, reduction, unknown.
921 We do that for the loop represented by LOOP_VINFO, and also to its
922 inner-loop, if it exists.
923 Examples for scalar cycles:
925 Example1: reduction:
927 loop1:
928 for (i=0; i<N; i++)
929 sum += a[i];
931 Example2: induction:
933 loop2:
934 for (i=0; i<N; i++)
935 a[i] = i; */
937 static void
938 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
940 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
942 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
944 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
945 Reductions in such inner-loop therefore have different properties than
946 the reductions in the nest that gets vectorized:
947 1. When vectorized, they are executed in the same order as in the original
948 scalar loop, so we can't change the order of computation when
949 vectorizing them.
950 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
951 current checks are too strict. */
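  /* For example (illustrative):

       for (i = 0; i < N; i++)        <-- outer loop considered here
         for (j = 0; j < M; j++)
           s[i] += a[i][j];           <-- inner-loop reduction, executed
                                          sequentially as described above.  */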
953 if (loop->inner)
954 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
957 /* Transfer group and reduction information from STMT to its pattern stmt. */
959 static void
960 vect_fixup_reduc_chain (gimple *stmt)
962 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
963 gimple *stmtp;
964 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
965 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
966 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
969 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
970 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
971 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
972 if (stmt)
973 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
974 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
976 while (stmt);
977 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
980 /* Fixup scalar cycles that now have their stmts detected as patterns. */
982 static void
983 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
985 gimple *first;
986 unsigned i;
988 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
989 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
991 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
992 while (next)
994 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
995 break;
996 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
998 /* If not all stmts in the chain are patterns, try to handle
999 the chain without patterns. */
1000 if (! next)
1002 vect_fixup_reduc_chain (first);
1003 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1004 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1009 /* Function vect_get_loop_niters.
1011 Determine how many iterations the loop is executed and place it
1012 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1013 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1014 niter information holds in ASSUMPTIONS.
1016 Return the loop exit condition. */
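/* For example (illustrative): for a countable loop whose header executes
   N times, NUMBER_OF_ITERATIONSM1 is the latch count N-1 and
   NUMBER_OF_ITERATIONS is the header count N.  */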
1019 static gcond *
1020 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1021 tree *number_of_iterations, tree *number_of_iterationsm1)
1023 edge exit = single_exit (loop);
1024 struct tree_niter_desc niter_desc;
1025 tree niter_assumptions, niter, may_be_zero;
1026 gcond *cond = get_loop_exit_condition (loop);
1028 *assumptions = boolean_true_node;
1029 *number_of_iterationsm1 = chrec_dont_know;
1030 *number_of_iterations = chrec_dont_know;
1031 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_NOTE, vect_location,
1033 "=== get_loop_niters ===\n");
1035 if (!exit)
1036 return cond;
1038 niter = chrec_dont_know;
1039 may_be_zero = NULL_TREE;
1040 niter_assumptions = boolean_true_node;
1041 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1042 || chrec_contains_undetermined (niter_desc.niter))
1043 return cond;
1045 niter_assumptions = niter_desc.assumptions;
1046 may_be_zero = niter_desc.may_be_zero;
1047 niter = niter_desc.niter;
1049 if (may_be_zero && integer_zerop (may_be_zero))
1050 may_be_zero = NULL_TREE;
1052 if (may_be_zero)
1054 if (COMPARISON_CLASS_P (may_be_zero))
1056 /* Try to combine may_be_zero with the assumptions; this can simplify
1057 computation of niter expression. */
1058 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1059 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1060 niter_assumptions,
1061 fold_build1 (TRUTH_NOT_EXPR,
1062 boolean_type_node,
1063 may_be_zero));
1064 else
1065 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1066 build_int_cst (TREE_TYPE (niter), 0), niter);
1068 may_be_zero = NULL_TREE;
1070 else if (integer_nonzerop (may_be_zero))
1072 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1073 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1074 return cond;
1076 else
1077 return cond;
1080 *assumptions = niter_assumptions;
1081 *number_of_iterationsm1 = niter;
1083 /* We want the number of loop header executions which is the number
1084 of latch executions plus one.
1085 ??? For UINT_MAX latch executions this number overflows to zero
1086 for loops like do { n++; } while (n != 0); */
1087 if (niter && !chrec_contains_undetermined (niter))
1088 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1089 build_int_cst (TREE_TYPE (niter), 1));
1090 *number_of_iterations = niter;
1092 return cond;
1095 /* Function bb_in_loop_p
1097 Used as predicate for dfs order traversal of the loop bbs. */
1099 static bool
1100 bb_in_loop_p (const_basic_block bb, const void *data)
1102 const struct loop *const loop = (const struct loop *)data;
1103 if (flow_bb_inside_loop_p (loop, bb))
1104 return true;
1105 return false;
1109 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1110 stmt_vec_info structs for all the stmts in LOOP_IN. */
1112 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1113 : vec_info (vec_info::loop, init_cost (loop_in)),
1114 loop (loop_in),
1115 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1116 num_itersm1 (NULL_TREE),
1117 num_iters (NULL_TREE),
1118 num_iters_unchanged (NULL_TREE),
1119 num_iters_assumptions (NULL_TREE),
1120 th (0),
1121 versioning_threshold (0),
1122 vectorization_factor (0),
1123 max_vectorization_factor (0),
1124 mask_skip_niters (NULL_TREE),
1125 mask_compare_type (NULL_TREE),
1126 unaligned_dr (NULL),
1127 peeling_for_alignment (0),
1128 ptr_mask (0),
1129 slp_unrolling_factor (1),
1130 single_scalar_iteration_cost (0),
1131 vectorizable (false),
1132 can_fully_mask_p (true),
1133 fully_masked_p (false),
1134 peeling_for_gaps (false),
1135 peeling_for_niter (false),
1136 operands_swapped (false),
1137 no_data_dependencies (false),
1138 has_mask_store (false),
1139 scalar_loop (NULL),
1140 orig_loop_info (NULL)
1142 /* Create/Update stmt_info for all stmts in the loop. */
1143 basic_block *body = get_loop_body (loop);
1144 for (unsigned int i = 0; i < loop->num_nodes; i++)
1146 basic_block bb = body[i];
1147 gimple_stmt_iterator si;
1149 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1151 gimple *phi = gsi_stmt (si);
1152 gimple_set_uid (phi, 0);
1153 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1156 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1158 gimple *stmt = gsi_stmt (si);
1159 gimple_set_uid (stmt, 0);
1160 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1163 free (body);
1165 /* CHECKME: We want to visit all BBs before their successors (except for
1166 latch blocks, for which this assertion wouldn't hold). In the simple
1167 case of the loop forms we allow, a dfs order of the BBs would be the same
1168 as a reverse postorder traversal, so we are safe. */
1170 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1171 bbs, loop->num_nodes, loop);
1172 gcc_assert (nbbs == loop->num_nodes);
1175 /* Free all levels of MASKS. */
1177 void
1178 release_vec_loop_masks (vec_loop_masks *masks)
1180 rgroup_masks *rgm;
1181 unsigned int i;
1182 FOR_EACH_VEC_ELT (*masks, i, rgm)
1183 rgm->masks.release ();
1184 masks->release ();
1187 /* Free all memory used by the _loop_vec_info, as well as all the
1188 stmt_vec_info structs of all the stmts in the loop. */
1190 _loop_vec_info::~_loop_vec_info ()
1192 int nbbs;
1193 gimple_stmt_iterator si;
1194 int j;
1196 nbbs = loop->num_nodes;
1197 for (j = 0; j < nbbs; j++)
1199 basic_block bb = bbs[j];
1200 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1201 free_stmt_vec_info (gsi_stmt (si));
1203 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1205 gimple *stmt = gsi_stmt (si);
1207 /* We may have broken canonical form by moving a constant
1208 into RHS1 of a commutative op. Fix such occurrences. */
1209 if (operands_swapped && is_gimple_assign (stmt))
1211 enum tree_code code = gimple_assign_rhs_code (stmt);
1213 if ((code == PLUS_EXPR
1214 || code == POINTER_PLUS_EXPR
1215 || code == MULT_EXPR)
1216 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1217 swap_ssa_operands (stmt,
1218 gimple_assign_rhs1_ptr (stmt),
1219 gimple_assign_rhs2_ptr (stmt));
1220 else if (code == COND_EXPR
1221 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1223 tree cond_expr = gimple_assign_rhs1 (stmt);
1224 enum tree_code cond_code = TREE_CODE (cond_expr);
1226 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1228 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1229 0));
1230 cond_code = invert_tree_comparison (cond_code,
1231 honor_nans);
1232 if (cond_code != ERROR_MARK)
1234 TREE_SET_CODE (cond_expr, cond_code);
1235 swap_ssa_operands (stmt,
1236 gimple_assign_rhs2_ptr (stmt),
1237 gimple_assign_rhs3_ptr (stmt));
1243 /* Free stmt_vec_info. */
1244 free_stmt_vec_info (stmt);
1245 gsi_next (&si);
1249 free (bbs);
1251 release_vec_loop_masks (&masks);
1253 loop->aux = NULL;
1256 /* Return true if we can use CMP_TYPE as the comparison type to produce
1257 all masks required to mask LOOP_VINFO. */
1259 static bool
1260 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1262 rgroup_masks *rgm;
1263 unsigned int i;
1264 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1265 if (rgm->mask_type != NULL_TREE
1266 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1267 cmp_type, rgm->mask_type,
1268 OPTIMIZE_FOR_SPEED))
1269 return false;
1270 return true;
1273 /* Calculate the maximum number of scalars per iteration for every
1274 rgroup in LOOP_VINFO. */
1276 static unsigned int
1277 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1279 unsigned int res = 1;
1280 unsigned int i;
1281 rgroup_masks *rgm;
1282 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1283 res = MAX (res, rgm->max_nscalars_per_iter);
1284 return res;
1287 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1288 whether we can actually generate the masks required. Return true if so,
1289 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
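/* A worked example of the sizing below (illustrative numbers only): with at
   most 1000 latch iterations the header runs at most 1001 times; if the
   widest rgroup needs 2 scalars per iteration, the masks must count up to
   2 * 1001 = 2002, which needs 11 bits, so any supported integer mode of at
   least 11 bits is a candidate comparison type for WHILE_ULT.  */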
1291 static bool
1292 vect_verify_full_masking (loop_vec_info loop_vinfo)
1294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1295 unsigned int min_ni_width;
1297 /* Get the maximum number of iterations that is representable
1298 in the counter type. */
1299 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1300 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1302 /* Get a more refined estimate for the number of iterations. */
1303 widest_int max_back_edges;
1304 if (max_loop_iterations (loop, &max_back_edges))
1305 max_ni = wi::smin (max_ni, max_back_edges + 1);
1307 /* Account for rgroup masks, in which each bit is replicated N times. */
1308 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1310 /* Work out how many bits we need to represent the limit. */
1311 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1313 /* Find a scalar mode for which WHILE_ULT is supported. */
1314 opt_scalar_int_mode cmp_mode_iter;
1315 tree cmp_type = NULL_TREE;
1316 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1318 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1319 if (cmp_bits >= min_ni_width
1320 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1322 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1323 if (this_type
1324 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1326 /* Although we could stop as soon as we find a valid mode,
1327 it's often better to continue until we hit Pmode, since the
1328 operands to the WHILE are more likely to be reusable in
1329 address calculations. */
1330 cmp_type = this_type;
1331 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1332 break;
1337 if (!cmp_type)
1338 return false;
1340 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1341 return true;
1344 /* Calculate the cost of one scalar iteration of the loop. */
1345 static void
1346 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1348 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1349 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1350 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1351 int innerloop_iters, i;
1353 /* Count statements in the scalar loop. Use this as the scalar cost for a single
1354 iteration for now.
1356 TODO: Add outer loop support.
1358 TODO: Consider assigning different costs to different scalar
1359 statements. */
1361 /* FORNOW. */
1362 innerloop_iters = 1;
1363 if (loop->inner)
1364 innerloop_iters = 50; /* FIXME */
1366 for (i = 0; i < nbbs; i++)
1368 gimple_stmt_iterator si;
1369 basic_block bb = bbs[i];
1371 if (bb->loop_father == loop->inner)
1372 factor = innerloop_iters;
1373 else
1374 factor = 1;
1376 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1378 gimple *stmt = gsi_stmt (si);
1379 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1381 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1382 continue;
1384 /* Skip stmts that are not vectorized inside the loop. */
1385 if (stmt_info
1386 && !STMT_VINFO_RELEVANT_P (stmt_info)
1387 && (!STMT_VINFO_LIVE_P (stmt_info)
1388 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1389 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1390 continue;
1392 vect_cost_for_stmt kind;
1393 if (STMT_VINFO_DATA_REF (stmt_info))
1395 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1396 kind = scalar_load;
1397 else
1398 kind = scalar_store;
1400 else
1401 kind = scalar_stmt;
1403 scalar_single_iter_cost
1404 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1405 factor, kind, stmt_info, 0, vect_prologue);
1408 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1409 = scalar_single_iter_cost;
1413 /* Function vect_analyze_loop_form_1.
1415 Verify that certain CFG restrictions hold, including:
1416 - the loop has a pre-header
1417 - the loop has a single entry and exit
1418 - the loop exit condition is simple enough
1419 - the number of iterations can be analyzed, i.e., a countable loop. The
1420 niter could be analyzed under some assumptions. */
1422 bool
1423 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1424 tree *assumptions, tree *number_of_iterationsm1,
1425 tree *number_of_iterations, gcond **inner_loop_cond)
1427 if (dump_enabled_p ())
1428 dump_printf_loc (MSG_NOTE, vect_location,
1429 "=== vect_analyze_loop_form ===\n");
1431 /* Different restrictions apply when we are considering an inner-most loop,
1432 vs. an outer (nested) loop.
1433 (FORNOW. May want to relax some of these restrictions in the future). */
1435 if (!loop->inner)
1437 /* Inner-most loop. We currently require that the number of BBs is
1438 exactly 2 (the header and latch). Vectorizable inner-most loops
1439 look like this:
1441 (pre-header)
1443 header <--------+
1444 | | |
1445 | +--> latch --+
1447 (exit-bb) */
1449 if (loop->num_nodes != 2)
1451 if (dump_enabled_p ())
1452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1453 "not vectorized: control flow in loop.\n");
1454 return false;
1457 if (empty_block_p (loop->header))
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1461 "not vectorized: empty loop.\n");
1462 return false;
1465 else
1467 struct loop *innerloop = loop->inner;
1468 edge entryedge;
1470 /* Nested loop. We currently require that the loop is doubly-nested,
1471 contains a single inner loop, and the number of BBs is exactly 5.
1472 Vectorizable outer-loops look like this:
1474 (pre-header)
1476 header <---+
1478 inner-loop |
1480 tail ------+
1482 (exit-bb)
1484 The inner-loop has the properties expected of inner-most loops
1485 as described above. */
1487 if ((loop->inner)->inner || (loop->inner)->next)
1489 if (dump_enabled_p ())
1490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1491 "not vectorized: multiple nested loops.\n");
1492 return false;
1495 if (loop->num_nodes != 5)
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 "not vectorized: control flow in loop.\n");
1500 return false;
1503 entryedge = loop_preheader_edge (innerloop);
1504 if (entryedge->src != loop->header
1505 || !single_exit (innerloop)
1506 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1508 if (dump_enabled_p ())
1509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1510 "not vectorized: unsupported outerloop form.\n");
1511 return false;
1514 /* Analyze the inner-loop. */
1515 tree inner_niterm1, inner_niter, inner_assumptions;
1516 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1517 &inner_assumptions, &inner_niterm1,
1518 &inner_niter, NULL)
1519 /* Don't support analyzing niter under assumptions for inner
1520 loop. */
1521 || !integer_onep (inner_assumptions))
1523 if (dump_enabled_p ())
1524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1525 "not vectorized: Bad inner loop.\n");
1526 return false;
1529 if (!expr_invariant_in_loop_p (loop, inner_niter))
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "not vectorized: inner-loop count not"
1534 " invariant.\n");
1535 return false;
1538 if (dump_enabled_p ())
1539 dump_printf_loc (MSG_NOTE, vect_location,
1540 "Considering outer-loop vectorization.\n");
1543 if (!single_exit (loop)
1544 || EDGE_COUNT (loop->header->preds) != 2)
1546 if (dump_enabled_p ())
1548 if (!single_exit (loop))
1549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1550 "not vectorized: multiple exits.\n");
1551 else if (EDGE_COUNT (loop->header->preds) != 2)
1552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1553 "not vectorized: too many incoming edges.\n");
1555 return false;
1558 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1559 that the loop is represented as a do-while (with a proper if-guard
1560 before the loop if needed), where the loop header contains all the
1561 executable statements, and the latch is empty. */
1562 if (!empty_block_p (loop->latch)
1563 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1565 if (dump_enabled_p ())
1566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1567 "not vectorized: latch block not empty.\n");
1568 return false;
1571 /* Make sure the exit is not abnormal. */
1572 edge e = single_exit (loop);
1573 if (e->flags & EDGE_ABNORMAL)
1575 if (dump_enabled_p ())
1576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577 "not vectorized: abnormal loop exit edge.\n");
1578 return false;
1581 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1582 number_of_iterationsm1);
1583 if (!*loop_cond)
1585 if (dump_enabled_p ())
1586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1587 "not vectorized: complicated exit condition.\n");
1588 return false;
1591 if (integer_zerop (*assumptions)
1592 || !*number_of_iterations
1593 || chrec_contains_undetermined (*number_of_iterations))
1595 if (dump_enabled_p ())
1596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1597 "not vectorized: number of iterations cannot be "
1598 "computed.\n");
1599 return false;
1602 if (integer_zerop (*number_of_iterations))
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "not vectorized: number of iterations = 0.\n");
1607 return false;
1610 return true;
1613 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1615 loop_vec_info
1616 vect_analyze_loop_form (struct loop *loop)
1618 tree assumptions, number_of_iterations, number_of_iterationsm1;
1619 gcond *loop_cond, *inner_loop_cond = NULL;
1621 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1622 &assumptions, &number_of_iterationsm1,
1623 &number_of_iterations, &inner_loop_cond))
1624 return NULL;
1626 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1627 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1628 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1629 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1630 if (!integer_onep (assumptions))
1632 /* We consider vectorizing this loop by versioning it under
1633 some assumptions. In order to do this, we need to clear
1634 existing information computed by scev and niter analyzer. */
1635 scev_reset_htab ();
1636 free_numbers_of_iterations_estimates (loop);
1637 /* Also set flag for this loop so that following scev and niter
1638 analysis are done under the assumptions. */
1639 loop_constraint_set (loop, LOOP_C_FINITE);
1640 /* Also record the assumptions for versioning. */
1641 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1644 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1646 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_NOTE, vect_location,
1649 "Symbolic number of iterations is ");
1650 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1651 dump_printf (MSG_NOTE, "\n");
1655 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1656 if (inner_loop_cond)
1657 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1658 = loop_exit_ctrl_vec_info_type;
1660 gcc_assert (!loop->aux);
1661 loop->aux = loop_vinfo;
1662 return loop_vinfo;
1667 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1668 statements, update the vectorization factor. */
1670 static void
1671 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1673 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1675 int nbbs = loop->num_nodes;
1676 poly_uint64 vectorization_factor;
1677 int i;
1679 if (dump_enabled_p ())
1680 dump_printf_loc (MSG_NOTE, vect_location,
1681 "=== vect_update_vf_for_slp ===\n");
1683 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1684 gcc_assert (known_ne (vectorization_factor, 0U));
1686 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1687 the vectorization factor of the loop is the unrolling factor required by
1688 the SLP instances. If that unrolling factor is 1, we say that we
1689 perform pure SLP on the loop; cross-iteration parallelism is not
1690 exploited. */
1691 bool only_slp_in_loop = true;
1692 for (i = 0; i < nbbs; i++)
1694 basic_block bb = bbs[i];
1695 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1696 gsi_next (&si))
1698 gimple *stmt = gsi_stmt (si);
1699 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1700 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1701 && STMT_VINFO_RELATED_STMT (stmt_info))
1703 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1704 stmt_info = vinfo_for_stmt (stmt);
1706 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1707 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1708 && !PURE_SLP_STMT (stmt_info))
1709 /* STMT needs both SLP and loop-based vectorization. */
1710 only_slp_in_loop = false;
1714 if (only_slp_in_loop)
1716 dump_printf_loc (MSG_NOTE, vect_location,
1717 "Loop contains only SLP stmts\n");
1718 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1720 else
1722 dump_printf_loc (MSG_NOTE, vect_location,
1723 "Loop contains SLP and non-SLP stmts\n");
1724 /* Both the vectorization factor and unroll factor have the form
1725 current_vector_size * X for some rational X, so they must have
1726 a common multiple. */
1727 vectorization_factor
1728 = force_common_multiple (vectorization_factor,
1729 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
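      /* For instance (illustrative numbers): a loop vectorization factor of 4
         combined with an SLP unrolling factor of 2 gives
         force_common_multiple (4, 2) = 4, whereas an unrolling factor of 8
         would raise the vectorization factor to 8.  */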
1732 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1733 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_NOTE, vect_location,
1736 "Updating vectorization factor to ");
1737 dump_dec (MSG_NOTE, vectorization_factor);
1738 dump_printf (MSG_NOTE, ".\n");
1742 /* Function vect_analyze_loop_operations.
1744 Scan the loop stmts and make sure they are all vectorizable. */
1746 static bool
1747 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1749 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1750 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1751 int nbbs = loop->num_nodes;
1752 int i;
1753 stmt_vec_info stmt_info;
1754 bool need_to_vectorize = false;
1755 bool ok;
1757 if (dump_enabled_p ())
1758 dump_printf_loc (MSG_NOTE, vect_location,
1759 "=== vect_analyze_loop_operations ===\n");
1761 for (i = 0; i < nbbs; i++)
1763 basic_block bb = bbs[i];
1765 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1766 gsi_next (&si))
1768 gphi *phi = si.phi ();
1769 ok = true;
1771 stmt_info = vinfo_for_stmt (phi);
1772 if (dump_enabled_p ())
1774 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1775 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1777 if (virtual_operand_p (gimple_phi_result (phi)))
1778 continue;
1780 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1781 (i.e., a phi in the tail of the outer-loop). */
1782 if (! is_loop_header_bb_p (bb))
1784 /* FORNOW: we currently don't support the case that these phis
1785 are not used in the outer loop (unless it is a double reduction,
1786 i.e., this phi is vect_reduction_def), because this case
1787 would require us to actually do something here. */
1788 if (STMT_VINFO_LIVE_P (stmt_info)
1789 && STMT_VINFO_DEF_TYPE (stmt_info)
1790 != vect_double_reduction_def)
1792 if (dump_enabled_p ())
1793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1794 "Unsupported loop-closed phi in "
1795 "outer-loop.\n");
1796 return false;
1799 /* If PHI is used in the outer loop, we check that its operand
1800 is defined in the inner loop. */
1801 if (STMT_VINFO_RELEVANT_P (stmt_info))
1803 tree phi_op;
1804 gimple *op_def_stmt;
1806 if (gimple_phi_num_args (phi) != 1)
1807 return false;
1809 phi_op = PHI_ARG_DEF (phi, 0);
1810 if (TREE_CODE (phi_op) != SSA_NAME)
1811 return false;
1813 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1814 if (gimple_nop_p (op_def_stmt)
1815 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1816 || !vinfo_for_stmt (op_def_stmt))
1817 return false;
1819 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1820 != vect_used_in_outer
1821 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1822 != vect_used_in_outer_by_reduction)
1823 return false;
1826 continue;
1829 gcc_assert (stmt_info);
1831 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1832 || STMT_VINFO_LIVE_P (stmt_info))
1833 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1835 /* A scalar-dependence cycle that we don't support. */
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "not vectorized: scalar dependence cycle.\n");
1839 return false;
1842 if (STMT_VINFO_RELEVANT_P (stmt_info))
1844 need_to_vectorize = true;
1845 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1846 && ! PURE_SLP_STMT (stmt_info))
1847 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1848 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1849 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1850 && ! PURE_SLP_STMT (stmt_info))
1851 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1854 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1855 if (ok
1856 && STMT_VINFO_LIVE_P (stmt_info)
1857 && !PURE_SLP_STMT (stmt_info))
1858 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1860 if (!ok)
1862 if (dump_enabled_p ())
1864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1865 "not vectorized: relevant phi not "
1866 "supported: ");
1867 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1869 return false;
1873 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1874 gsi_next (&si))
1876 gimple *stmt = gsi_stmt (si);
1877 if (!gimple_clobber_p (stmt)
1878 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1879 return false;
1881 } /* bbs */
1883 /* All operations in the loop are either irrelevant (they deal with loop
1884 control, or are dead), or are only used outside the loop and can be moved
1885 out of the loop (e.g. invariants, inductions). The loop can be
1886 optimized away by scalar optimizations. We're better off not
1887 touching this loop. */
1888 if (!need_to_vectorize)
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_NOTE, vect_location,
1892 "All the computation can be taken out of the loop.\n");
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1895 "not vectorized: redundant loop. no profit to "
1896 "vectorize.\n");
1897 return false;
1900 return true;
1903 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1904 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1905 definitely no, or -1 if it's worth retrying. */
1907 static int
1908 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1910 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1911 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1913 /* Only fully-masked loops can have iteration counts less than the
1914 vectorization factor. */
1915 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1917 HOST_WIDE_INT max_niter;
1919 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1920 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1921 else
1922 max_niter = max_stmt_executions_int (loop);
1924 if (max_niter != -1
1925 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "not vectorized: iteration count smaller than "
1930 "vectorization factor.\n");
1931 return 0;
1935 int min_profitable_iters, min_profitable_estimate;
1936 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1937 &min_profitable_estimate);
1939 if (min_profitable_iters < 0)
1941 if (dump_enabled_p ())
1942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1943 "not vectorized: vectorization not profitable.\n");
1944 if (dump_enabled_p ())
1945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1946 "not vectorized: vector version will never be "
1947 "profitable.\n");
1948 return -1;
1951 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1952 * assumed_vf);
1954 /* Use the cost model only if it is more conservative than the user-specified
1955 threshold. */
1956 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1957 min_profitable_iters);
1959 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
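/* Editor's worked example (hypothetical numbers): with
   --param min-vect-loop-bound=4 and assumed_vf == 8,
   min_scalar_loop_bound is 4 * 8 == 32; if the target cost model reports
   min_profitable_iters == 20, then th == MAX (32, 20) == 32, so a loop
   known to run fewer than 32 iterations is rejected just below.  */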
1961 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1962 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1964 if (dump_enabled_p ())
1965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1966 "not vectorized: vectorization not profitable.\n");
1967 if (dump_enabled_p ())
1968 dump_printf_loc (MSG_NOTE, vect_location,
1969 "not vectorized: iteration count smaller than user "
1970 "specified loop bound parameter or minimum profitable "
1971 "iterations (whichever is more conservative).\n");
1972 return 0;
1975 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1976 if (estimated_niter == -1)
1977 estimated_niter = likely_max_stmt_executions_int (loop);
1978 if (estimated_niter != -1
1979 && ((unsigned HOST_WIDE_INT) estimated_niter
1980 < MAX (th, (unsigned) min_profitable_estimate)))
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1984 "not vectorized: estimated iteration count too "
1985 "small.\n");
1986 if (dump_enabled_p ())
1987 dump_printf_loc (MSG_NOTE, vect_location,
1988 "not vectorized: estimated iteration count smaller "
1989 "than specified loop bound parameter or minimum "
1990 "profitable iterations (whichever is more "
1991 "conservative).\n");
1992 return -1;
1995 return 1;
1999 /* Function vect_analyze_loop_2.
2001 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2002 for it. The different analyses will record information in the
2003 loop_vec_info struct. */
2004 static bool
2005 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2007 bool ok;
2008 int res;
2009 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2010 poly_uint64 min_vf = 2;
2011 unsigned int n_stmts = 0;
2013 /* The first group of checks is independent of the vector size. */
2014 fatal = true;
2016 /* Find all data references in the loop (which correspond to vdefs/vuses)
2017 and analyze their evolution in the loop. */
2019 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2021 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2022 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2026 "not vectorized: loop nest containing two "
2027 "or more consecutive inner loops cannot be "
2028 "vectorized\n");
2029 return false;
2032 for (unsigned i = 0; i < loop->num_nodes; i++)
2033 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2034 !gsi_end_p (gsi); gsi_next (&gsi))
2036 gimple *stmt = gsi_stmt (gsi);
2037 if (is_gimple_debug (stmt))
2038 continue;
2039 ++n_stmts;
2040 if (!find_data_references_in_stmt (loop, stmt,
2041 &LOOP_VINFO_DATAREFS (loop_vinfo)))
2043 if (is_gimple_call (stmt) && loop->safelen)
2045 tree fndecl = gimple_call_fndecl (stmt), op;
2046 if (fndecl != NULL_TREE)
2048 cgraph_node *node = cgraph_node::get (fndecl);
2049 if (node != NULL && node->simd_clones != NULL)
2051 unsigned int j, n = gimple_call_num_args (stmt);
2052 for (j = 0; j < n; j++)
2054 op = gimple_call_arg (stmt, j);
2055 if (DECL_P (op)
2056 || (REFERENCE_CLASS_P (op)
2057 && get_base_address (op)))
2058 break;
2060 op = gimple_call_lhs (stmt);
2061 /* Ignore #pragma omp declare simd functions
2062 if they don't have data references in the
2063 call stmt itself. */
2064 if (j == n
2065 && !(op
2066 && (DECL_P (op)
2067 || (REFERENCE_CLASS_P (op)
2068 && get_base_address (op)))))
2069 continue;
2073 if (dump_enabled_p ())
2074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2075 "not vectorized: loop contains function "
2076 "calls or data references that cannot "
2077 "be analyzed\n");
2078 return false;
2082 /* Analyze the data references and also adjust the minimal
2083 vectorization factor according to the loads and stores. */
2085 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2086 if (!ok)
2088 if (dump_enabled_p ())
2089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2090 "bad data references.\n");
2091 return false;
2094 /* Classify all cross-iteration scalar data-flow cycles.
2095 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2096 vect_analyze_scalar_cycles (loop_vinfo);
2098 vect_pattern_recog (loop_vinfo);
2100 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2102 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2103 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2105 ok = vect_analyze_data_ref_accesses (loop_vinfo);
2106 if (!ok)
2108 if (dump_enabled_p ())
2109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2110 "bad data access.\n");
2111 return false;
2114 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2116 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2117 if (!ok)
2119 if (dump_enabled_p ())
2120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2121 "unexpected pattern.\n");
2122 return false;
2125 /* The rest of the analysis below depends on the vector size in some way, so from here on a failure is no longer fatal: it can be retried with a different vector size. */
2126 fatal = false;
2128 /* Analyze data dependences between the data-refs in the loop
2129 and adjust the maximum vectorization factor according to
2130 the dependences.
2131 FORNOW: fail at the first data dependence that we encounter. */
2133 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2134 if (!ok
2135 || (max_vf != MAX_VECTORIZATION_FACTOR
2136 && maybe_lt (max_vf, min_vf)))
2138 if (dump_enabled_p ())
2139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2140 "bad data dependence.\n");
2141 return false;
2143 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2145 ok = vect_determine_vectorization_factor (loop_vinfo);
2146 if (!ok)
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "can't determine vectorization factor.\n");
2151 return false;
2153 if (max_vf != MAX_VECTORIZATION_FACTOR
2154 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2156 if (dump_enabled_p ())
2157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2158 "bad data dependence.\n");
2159 return false;
2162 /* Compute the scalar iteration cost. */
2163 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2165 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2166 unsigned th;
2168 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2169 ok = vect_analyze_slp (loop_vinfo, n_stmts);
2170 if (!ok)
2171 return false;
2173 /* If there are any SLP instances mark them as pure_slp. */
2174 bool slp = vect_make_slp_decision (loop_vinfo);
2175 if (slp)
2177 /* Find stmts that need to be both vectorized and SLPed. */
2178 vect_detect_hybrid_slp (loop_vinfo);
2180 /* Update the vectorization factor based on the SLP decision. */
2181 vect_update_vf_for_slp (loop_vinfo);
2184 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2186 /* We don't expect to have to roll back to anything other than an empty
2187 set of rgroups. */
2188 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2190 /* This is the point where we can re-start analysis with SLP forced off. */
2191 start_over:
2193 /* Now the vectorization factor is final. */
2194 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2195 gcc_assert (known_ne (vectorization_factor, 0U));
2197 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2199 dump_printf_loc (MSG_NOTE, vect_location,
2200 "vectorization_factor = ");
2201 dump_dec (MSG_NOTE, vectorization_factor);
2202 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2203 LOOP_VINFO_INT_NITERS (loop_vinfo));
2206 HOST_WIDE_INT max_niter
2207 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2209 /* Analyze the alignment of the data-refs in the loop.
2210 Fail if a data reference is found that cannot be vectorized. */
2212 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2213 if (!ok)
2215 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2217 "bad data alignment.\n");
2218 return false;
2221 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2222 It is important to call pruning after vect_analyze_data_ref_accesses,
2223 since we use grouping information gathered by interleaving analysis. */
2224 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2225 if (!ok)
2226 return false;
2228 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2229 vectorization. */
2230 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2232 /* This pass will decide on using loop versioning and/or loop peeling in
2233 order to enhance the alignment of data references in the loop. */
2234 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2235 if (!ok)
2237 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2239 "bad data alignment.\n");
2240 return false;
2244 if (slp)
2246 /* Analyze operations in the SLP instances. Note this may
2247 remove unsupported SLP instances which makes the above
2248 SLP kind detection invalid. */
2249 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2250 vect_slp_analyze_operations (loop_vinfo);
2251 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2252 goto again;
2255 /* Scan all the remaining operations in the loop that are not subject
2256 to SLP and make sure they are vectorizable. */
2257 ok = vect_analyze_loop_operations (loop_vinfo);
2258 if (!ok)
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 "bad operation or unsupported loop bound.\n");
2263 return false;
2266 /* Decide whether to use a fully-masked loop for this vectorization
2267 factor. */
2268 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2269 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2270 && vect_verify_full_masking (loop_vinfo));
2271 if (dump_enabled_p ())
2273 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2274 dump_printf_loc (MSG_NOTE, vect_location,
2275 "using a fully-masked loop.\n");
2276 else
2277 dump_printf_loc (MSG_NOTE, vect_location,
2278 "not using a fully-masked loop.\n");
2281 /* If an epilogue loop is required because of data accesses with gaps,
2282 one additional iteration needs to be peeled. Check if there are
2283 enough iterations for vectorization. */
2284 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2285 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2286 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2288 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2289 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2291 if (known_lt (wi::to_widest (scalar_niters), vf))
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "loop has no enough iterations to support"
2296 " peeling for gaps.\n");
2297 return false;
2301 /* Check that the costing of the loop makes vectorizing worthwhile. */
2302 res = vect_analyze_loop_costing (loop_vinfo);
2303 if (res < 0)
2304 goto again;
2305 if (!res)
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 "Loop costings not worthwhile.\n");
2310 return false;
2313 /* Decide whether we need to create an epilogue loop to handle
2314 remaining scalar iterations. */
2315 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2317 unsigned HOST_WIDE_INT const_vf;
2318 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2319 /* The main loop handles all iterations. */
2320 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2321 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2322 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2324 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2325 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2326 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2327 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2329 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2330 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2331 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2332 < (unsigned) exact_log2 (const_vf))
2333 /* In case of versioning, check if the maximum number of
2334 iterations is greater than th. If they are identical,
2335 the epilogue is unnecessary. */
2336 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2337 || ((unsigned HOST_WIDE_INT) max_niter
2338 > (th / const_vf) * const_vf))))
2339 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2341 /* If an epilogue loop is required make sure we can create one. */
2342 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2343 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2345 if (dump_enabled_p ())
2346 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2347 if (!vect_can_advance_ivs_p (loop_vinfo)
2348 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2349 single_exit (LOOP_VINFO_LOOP
2350 (loop_vinfo))))
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2354 "not vectorized: can't create required "
2355 "epilog loop\n");
2356 goto again;
2360 /* During peeling, we need to check if the number of loop iterations is
2361 enough for both the peeled prolog loop and the vector loop. This check
2362 can be merged along with threshold check of loop versioning, so
2363 increase threshold for this case if necessary. */
2364 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2366 poly_uint64 niters_th = 0;
2368 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2370 /* Niters for peeled prolog loop. */
2371 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2373 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2374 tree vectype
2375 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2376 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2378 else
2379 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2382 /* Niters for at least one iteration of vectorized loop. */
2383 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2384 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2385 /* One additional iteration because of peeling for gap. */
2386 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2387 niters_th += 1;
2388 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2391 gcc_assert (known_eq (vectorization_factor,
2392 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2394 /* Ok to vectorize! */
2395 return true;
2397 again:
2398 /* Try again with SLP forced off but if we didn't do any SLP there is
2399 no point in re-trying. */
2400 if (!slp)
2401 return false;
2403 /* If there are reduction chains re-trying will fail anyway. */
2404 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2405 return false;
2407 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2408 via interleaving or lane instructions. */
2409 slp_instance instance;
2410 slp_tree node;
2411 unsigned i, j;
2412 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2414 stmt_vec_info vinfo;
2415 vinfo = vinfo_for_stmt
2416 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2417 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2418 continue;
2419 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2420 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2421 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2422 if (! vect_store_lanes_supported (vectype, size, false)
2423 && ! vect_grouped_store_supported (vectype, size))
2424 return false;
2425 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2427 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2428 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2429 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2430 size = STMT_VINFO_GROUP_SIZE (vinfo);
2431 vectype = STMT_VINFO_VECTYPE (vinfo);
2432 if (! vect_load_lanes_supported (vectype, size, false)
2433 && ! vect_grouped_load_supported (vectype, single_element_p,
2434 size))
2435 return false;
2439 if (dump_enabled_p ())
2440 dump_printf_loc (MSG_NOTE, vect_location,
2441 "re-trying with SLP disabled\n");
2443 /* Roll back state appropriately. No SLP this time. */
2444 slp = false;
2445 /* Restore the vectorization factor to what it was without SLP. */
2446 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2447 /* Free the SLP instances. */
2448 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2449 vect_free_slp_instance (instance);
2450 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2451 /* Reset SLP type to loop_vect on all stmts. */
2452 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2454 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2455 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2456 !gsi_end_p (si); gsi_next (&si))
2458 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2459 STMT_SLP_TYPE (stmt_info) = loop_vect;
2461 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2462 !gsi_end_p (si); gsi_next (&si))
2464 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2465 STMT_SLP_TYPE (stmt_info) = loop_vect;
2466 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2468 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2469 STMT_SLP_TYPE (stmt_info) = loop_vect;
2470 for (gimple_stmt_iterator pi
2471 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2472 !gsi_end_p (pi); gsi_next (&pi))
2474 gimple *pstmt = gsi_stmt (pi);
2475 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2480 /* Free optimized alias test DDRS. */
2481 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2482 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2483 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2484 /* Reset target cost data. */
2485 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2486 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2487 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2488 /* Reset accumulated rgroup information. */
2489 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2490 /* Reset assorted flags. */
2491 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2492 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2493 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2494 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2495 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2497 goto start_over;
2500 /* Function vect_analyze_loop.
2502 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2503 for it. The different analyses will record information in the
2504 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2505 be vectorized. */
2506 loop_vec_info
2507 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2509 loop_vec_info loop_vinfo;
2510 auto_vector_sizes vector_sizes;
2512 /* Autodetect first vector size we try. */
2513 current_vector_size = 0;
2514 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2515 unsigned int next_size = 0;
2517 if (dump_enabled_p ())
2518 dump_printf_loc (MSG_NOTE, vect_location,
2519 "===== analyze_loop_nest =====\n");
2521 if (loop_outer (loop)
2522 && loop_vec_info_for_loop (loop_outer (loop))
2523 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_NOTE, vect_location,
2527 "outer-loop already vectorized.\n");
2528 return NULL;
2531 poly_uint64 autodetected_vector_size = 0;
2532 while (1)
2534 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2535 loop_vinfo = vect_analyze_loop_form (loop);
2536 if (!loop_vinfo)
2538 if (dump_enabled_p ())
2539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2540 "bad loop form.\n");
2541 return NULL;
2544 bool fatal = false;
2546 if (orig_loop_vinfo)
2547 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2549 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2551 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2553 return loop_vinfo;
2556 delete loop_vinfo;
2558 if (next_size == 0)
2559 autodetected_vector_size = current_vector_size;
2561 if (next_size < vector_sizes.length ()
2562 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2563 next_size += 1;
2565 if (fatal
2566 || next_size == vector_sizes.length ()
2567 || known_eq (current_vector_size, 0U))
2568 return NULL;
2570 /* Try the next biggest vector size. */
2571 current_vector_size = vector_sizes[next_size++];
2572 if (dump_enabled_p ())
2574 dump_printf_loc (MSG_NOTE, vect_location,
2575 "***** Re-trying analysis with "
2576 "vector size ");
2577 dump_dec (MSG_NOTE, current_vector_size);
2578 dump_printf (MSG_NOTE, "\n");
2583 /* Return true if there is an in-order reduction function for CODE, storing
2584 it in *REDUC_FN if so. */
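/* Editor's illustration: the canonical in-order reduction is a
   floating-point accumulation compiled without -fassociative-math:

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   FP addition is not associative, so the partial sums may not be reordered
   and the vector elements have to be folded into the accumulator one at a
   time, which is what IFN_FOLD_LEFT_PLUS expresses.  */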
2586 static bool
2587 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2589 switch (code)
2591 case PLUS_EXPR:
2592 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2593 return true;
2595 default:
2596 return false;
2600 /* Function reduction_fn_for_scalar_code
2602 Input:
2603 CODE - tree_code of a reduction operations.
2605 Output:
2606 REDUC_FN - the corresponding internal function to be used to reduce the
2607 vector of partial results into a single scalar result, or IFN_LAST
2608 if the operation is a supported reduction operation, but does not have
2609 such an internal function.
2611 Return FALSE if CODE currently cannot be vectorized as reduction. */
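/* Editor's illustration: for a max reduction such as

     for (int i = 0; i < n; i++)
       m = a[i] > m ? a[i] : m;

   the vector loop keeps a vector of partial maxima and IFN_REDUC_MAX
   collapses it to the final scalar after the loop.  For MULT_EXPR and
   MINUS_EXPR we return IFN_LAST: the reduction is still vectorizable, but
   the epilogue is typically open-coded with shifts/permutes instead of a
   single internal function.  */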
2613 static bool
2614 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2616 switch (code)
2618 case MAX_EXPR:
2619 *reduc_fn = IFN_REDUC_MAX;
2620 return true;
2622 case MIN_EXPR:
2623 *reduc_fn = IFN_REDUC_MIN;
2624 return true;
2626 case PLUS_EXPR:
2627 *reduc_fn = IFN_REDUC_PLUS;
2628 return true;
2630 case BIT_AND_EXPR:
2631 *reduc_fn = IFN_REDUC_AND;
2632 return true;
2634 case BIT_IOR_EXPR:
2635 *reduc_fn = IFN_REDUC_IOR;
2636 return true;
2638 case BIT_XOR_EXPR:
2639 *reduc_fn = IFN_REDUC_XOR;
2640 return true;
2642 case MULT_EXPR:
2643 case MINUS_EXPR:
2644 *reduc_fn = IFN_LAST;
2645 return true;
2647 default:
2648 return false;
2652 /* If there is a neutral value X such that SLP reduction NODE would not
2653 be affected by the introduction of additional X elements, return that X,
2654 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2655 is true if the SLP statements perform a single reduction, false if each
2656 statement performs an independent reduction. */
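/* Editor's note: the neutral value is the identity element of CODE, i.e.
   one that can be appended to the vector without changing the result:
   extra zeros do not change a sum, extra ones do not change a product, and
   extra all-ones values do not change a bitwise AND, which is why the
   cases below return 0, 1 and ~0 respectively.  */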
2658 static tree
2659 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2660 bool reduc_chain)
2662 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2663 gimple *stmt = stmts[0];
2664 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2665 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2666 tree scalar_type = TREE_TYPE (vector_type);
2667 struct loop *loop = gimple_bb (stmt)->loop_father;
2668 gcc_assert (loop);
2670 switch (code)
2672 case WIDEN_SUM_EXPR:
2673 case DOT_PROD_EXPR:
2674 case SAD_EXPR:
2675 case PLUS_EXPR:
2676 case MINUS_EXPR:
2677 case BIT_IOR_EXPR:
2678 case BIT_XOR_EXPR:
2679 return build_zero_cst (scalar_type);
2681 case MULT_EXPR:
2682 return build_one_cst (scalar_type);
2684 case BIT_AND_EXPR:
2685 return build_all_ones_cst (scalar_type);
2687 case MAX_EXPR:
2688 case MIN_EXPR:
2689 /* For MIN/MAX the initial values are neutral. A reduction chain
2690 has only a single initial value, so that value is neutral for
2691 all statements. */
2692 if (reduc_chain)
2693 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2694 return NULL_TREE;
2696 default:
2697 return NULL_TREE;
2701 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2702 STMT is printed with a message MSG. */
2704 static void
2705 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2707 dump_printf_loc (msg_type, vect_location, "%s", msg);
2708 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2712 /* Detect SLP reduction of the form:
2714 #a1 = phi <a5, a0>
2715 a2 = operation (a1)
2716 a3 = operation (a2)
2717 a4 = operation (a3)
2718 a5 = operation (a4)
2720 #a = phi <a5>
2722 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2723 FIRST_STMT is the first reduction stmt in the chain
2724 (a2 = operation (a1)).
2726 Return TRUE if a reduction chain was detected. */
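/* Editor's illustration: such a chain typically comes from a manually
   unrolled accumulation, e.g.

     for (int i = 0; i < n; i += 4)
       sum = sum + a[i] + a[i + 1] + a[i + 2] + a[i + 3];

   which gimplifies into a2 = a1 + ..., a3 = a2 + ..., a4 = a3 + ...,
   a5 = a4 + ... feeding the reduction PHI, as in the pattern above.  */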
2728 static bool
2729 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2730 gimple *first_stmt)
2732 struct loop *loop = (gimple_bb (phi))->loop_father;
2733 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2734 enum tree_code code;
2735 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2736 stmt_vec_info use_stmt_info, current_stmt_info;
2737 tree lhs;
2738 imm_use_iterator imm_iter;
2739 use_operand_p use_p;
2740 int nloop_uses, size = 0, n_out_of_loop_uses;
2741 bool found = false;
2743 if (loop != vect_loop)
2744 return false;
2746 lhs = PHI_RESULT (phi);
2747 code = gimple_assign_rhs_code (first_stmt);
2748 while (1)
2750 nloop_uses = 0;
2751 n_out_of_loop_uses = 0;
2752 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2754 gimple *use_stmt = USE_STMT (use_p);
2755 if (is_gimple_debug (use_stmt))
2756 continue;
2758 /* Check if we got back to the reduction phi. */
2759 if (use_stmt == phi)
2761 loop_use_stmt = use_stmt;
2762 found = true;
2763 break;
2766 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2768 loop_use_stmt = use_stmt;
2769 nloop_uses++;
2771 else
2772 n_out_of_loop_uses++;
2774 /* There can be either a single use in the loop or two uses in
2775 phi nodes. */
2776 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2777 return false;
2780 if (found)
2781 break;
2783 /* We reached a statement with no loop uses. */
2784 if (nloop_uses == 0)
2785 return false;
2787 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2788 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2789 return false;
2791 if (!is_gimple_assign (loop_use_stmt)
2792 || code != gimple_assign_rhs_code (loop_use_stmt)
2793 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2794 return false;
2796 /* Insert USE_STMT into reduction chain. */
2797 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2798 if (current_stmt)
2800 current_stmt_info = vinfo_for_stmt (current_stmt);
2801 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2802 GROUP_FIRST_ELEMENT (use_stmt_info)
2803 = GROUP_FIRST_ELEMENT (current_stmt_info);
2805 else
2806 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2808 lhs = gimple_assign_lhs (loop_use_stmt);
2809 current_stmt = loop_use_stmt;
2810 size++;
2813 if (!found || loop_use_stmt != phi || size < 2)
2814 return false;
2816 /* Swap the operands, if needed, to make the reduction operand be the second
2817 operand. */
2818 lhs = PHI_RESULT (phi);
2819 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2820 while (next_stmt)
2822 if (gimple_assign_rhs2 (next_stmt) == lhs)
2824 tree op = gimple_assign_rhs1 (next_stmt);
2825 gimple *def_stmt = NULL;
2827 if (TREE_CODE (op) == SSA_NAME)
2828 def_stmt = SSA_NAME_DEF_STMT (op);
2830 /* Check that the other def is either defined in the loop
2831 ("vect_internal_def"), or it's an induction (defined by a
2832 loop-header phi-node). */
2833 if (def_stmt
2834 && gimple_bb (def_stmt)
2835 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2836 && (is_gimple_assign (def_stmt)
2837 || is_gimple_call (def_stmt)
2838 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2839 == vect_induction_def
2840 || (gimple_code (def_stmt) == GIMPLE_PHI
2841 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2842 == vect_internal_def
2843 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2845 lhs = gimple_assign_lhs (next_stmt);
2846 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2847 continue;
2850 return false;
2852 else
2854 tree op = gimple_assign_rhs2 (next_stmt);
2855 gimple *def_stmt = NULL;
2857 if (TREE_CODE (op) == SSA_NAME)
2858 def_stmt = SSA_NAME_DEF_STMT (op);
2860 /* Check that the other def is either defined in the loop
2861 ("vect_internal_def"), or it's an induction (defined by a
2862 loop-header phi-node). */
2863 if (def_stmt
2864 && gimple_bb (def_stmt)
2865 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2866 && (is_gimple_assign (def_stmt)
2867 || is_gimple_call (def_stmt)
2868 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2869 == vect_induction_def
2870 || (gimple_code (def_stmt) == GIMPLE_PHI
2871 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2872 == vect_internal_def
2873 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2875 if (dump_enabled_p ())
2877 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2878 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2881 swap_ssa_operands (next_stmt,
2882 gimple_assign_rhs1_ptr (next_stmt),
2883 gimple_assign_rhs2_ptr (next_stmt));
2884 update_stmt (next_stmt);
2886 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2887 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2889 else
2890 return false;
2893 lhs = gimple_assign_lhs (next_stmt);
2894 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2897 /* Save the chain for further analysis in SLP detection. */
2898 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2899 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2900 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2902 return true;
2905 /* Return true if we need an in-order reduction for operation CODE
2906 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2907 overflow must wrap. */
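/* Editor's illustration: besides non-associative FP math, an in-order
   reduction is also forced when reassociation could introduce a trapping
   intermediate overflow, e.g. a signed accumulation compiled with -ftrapv:

     int s = 0;
     for (int i = 0; i < n; i++)
       s += a[i];

   Reordering the additions could trap on a partial sum even though the
   original left-to-right evaluation would not.  */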
2909 static bool
2910 needs_fold_left_reduction_p (tree type, tree_code code,
2911 bool need_wrapping_integral_overflow)
2913 /* CHECKME: check for !flag_finite_math_only too? */
2914 if (SCALAR_FLOAT_TYPE_P (type))
2915 switch (code)
2917 case MIN_EXPR:
2918 case MAX_EXPR:
2919 return false;
2921 default:
2922 return !flag_associative_math;
2925 if (INTEGRAL_TYPE_P (type))
2927 if (!operation_no_trapping_overflow (type, code))
2928 return true;
2929 if (need_wrapping_integral_overflow
2930 && !TYPE_OVERFLOW_WRAPS (type)
2931 && operation_can_overflow (code))
2932 return true;
2933 return false;
2936 if (SAT_FIXED_POINT_TYPE_P (type))
2937 return true;
2939 return false;
2942 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2943 reduction operation CODE has a handled computation expression. */
2945 bool
2946 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2947 enum tree_code code)
2949 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2950 auto_bitmap visited;
2951 tree lookfor = PHI_RESULT (phi);
2952 ssa_op_iter curri;
2953 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2954 while (USE_FROM_PTR (curr) != loop_arg)
2955 curr = op_iter_next_use (&curri);
2956 curri.i = curri.numops;
2959 path.safe_push (std::make_pair (curri, curr));
2960 tree use = USE_FROM_PTR (curr);
2961 if (use == lookfor)
2962 break;
2963 gimple *def = SSA_NAME_DEF_STMT (use);
2964 if (gimple_nop_p (def)
2965 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2967 pop:
2970 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2971 curri = x.first;
2972 curr = x.second;
2974 curr = op_iter_next_use (&curri);
2975 /* Skip already visited or non-SSA operands (from iterating
2976 over PHI args). */
2977 while (curr != NULL_USE_OPERAND_P
2978 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2979 || ! bitmap_set_bit (visited,
2980 SSA_NAME_VERSION
2981 (USE_FROM_PTR (curr)))));
2983 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2984 if (curr == NULL_USE_OPERAND_P)
2985 break;
2987 else
2989 if (gimple_code (def) == GIMPLE_PHI)
2990 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2991 else
2992 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2993 while (curr != NULL_USE_OPERAND_P
2994 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2995 || ! bitmap_set_bit (visited,
2996 SSA_NAME_VERSION
2997 (USE_FROM_PTR (curr)))))
2998 curr = op_iter_next_use (&curri);
2999 if (curr == NULL_USE_OPERAND_P)
3000 goto pop;
3003 while (1);
3004 if (dump_file && (dump_flags & TDF_DETAILS))
3006 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3007 unsigned i;
3008 std::pair<ssa_op_iter, use_operand_p> *x;
3009 FOR_EACH_VEC_ELT (path, i, x)
3011 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3012 dump_printf (MSG_NOTE, " ");
3014 dump_printf (MSG_NOTE, "\n");
3017 /* Check whether the reduction path detected is valid. */
3018 bool fail = path.length () == 0;
3019 bool neg = false;
3020 for (unsigned i = 1; i < path.length (); ++i)
3022 gimple *use_stmt = USE_STMT (path[i].second);
3023 tree op = USE_FROM_PTR (path[i].second);
3024 if (! has_single_use (op)
3025 || ! is_gimple_assign (use_stmt))
3027 fail = true;
3028 break;
3030 if (gimple_assign_rhs_code (use_stmt) != code)
3032 if (code == PLUS_EXPR
3033 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3035 /* Track whether we negate the reduction value each iteration. */
3036 if (gimple_assign_rhs2 (use_stmt) == op)
3037 neg = ! neg;
3039 else
3041 fail = true;
3042 break;
3046 return ! fail && ! neg;
3050 /* Function vect_is_simple_reduction
3052 (1) Detect a cross-iteration def-use cycle that represents a simple
3053 reduction computation. We look for the following pattern:
3055 loop_header:
3056 a1 = phi < a0, a2 >
3057 a3 = ...
3058 a2 = operation (a3, a1)
3060 or
3062 a3 = ...
3063 loop_header:
3064 a1 = phi < a0, a2 >
3065 a2 = operation (a3, a1)
3067 such that:
3068 1. operation is commutative and associative and it is safe to
3069 change the order of the computation
3070 2. no uses for a2 in the loop (a2 is used out of the loop)
3071 3. no uses of a1 in the loop besides the reduction operation
3072 4. no uses of a1 outside the loop.
3074 Conditions 1,4 are tested here.
3075 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3077 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3078 nested cycles.
3080 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3081 reductions:
3083 a1 = phi < a0, a2 >
3084 inner loop (def of a3)
3085 a2 = phi < a3 >
3087 (4) Detect condition expressions, i.e.:
3088 for (int i = 0; i < N; i++)
3089 if (a[i] < val)
3090 ret_val = a[i];
3094 static gimple *
3095 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3096 bool *double_reduc,
3097 bool need_wrapping_integral_overflow,
3098 enum vect_reduction_type *v_reduc_type)
3100 struct loop *loop = (gimple_bb (phi))->loop_father;
3101 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3102 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3103 enum tree_code orig_code, code;
3104 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3105 tree type;
3106 int nloop_uses;
3107 tree name;
3108 imm_use_iterator imm_iter;
3109 use_operand_p use_p;
3110 bool phi_def;
3112 *double_reduc = false;
3113 *v_reduc_type = TREE_CODE_REDUCTION;
3115 tree phi_name = PHI_RESULT (phi);
3116 /* ??? If there are no uses of the PHI result the inner loop reduction
3117 won't be detected as possibly double-reduction by vectorizable_reduction
3118 because that tries to walk the PHI arg from the preheader edge which
3119 can be constant. See PR60382. */
3120 if (has_zero_uses (phi_name))
3121 return NULL;
3122 nloop_uses = 0;
3123 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3125 gimple *use_stmt = USE_STMT (use_p);
3126 if (is_gimple_debug (use_stmt))
3127 continue;
3129 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3131 if (dump_enabled_p ())
3132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3133 "intermediate value used outside loop.\n");
3135 return NULL;
3138 nloop_uses++;
3139 if (nloop_uses > 1)
3141 if (dump_enabled_p ())
3142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3143 "reduction value used in loop.\n");
3144 return NULL;
3147 phi_use_stmt = use_stmt;
3150 edge latch_e = loop_latch_edge (loop);
3151 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3152 if (TREE_CODE (loop_arg) != SSA_NAME)
3154 if (dump_enabled_p ())
3156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3157 "reduction: not ssa_name: ");
3158 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3159 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3161 return NULL;
3164 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3165 if (is_gimple_assign (def_stmt))
3167 name = gimple_assign_lhs (def_stmt);
3168 phi_def = false;
3170 else if (gimple_code (def_stmt) == GIMPLE_PHI)
3172 name = PHI_RESULT (def_stmt);
3173 phi_def = true;
3175 else
3177 if (dump_enabled_p ())
3179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3180 "reduction: unhandled reduction operation: ");
3181 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3183 return NULL;
3186 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3187 return NULL;
3189 nloop_uses = 0;
3190 auto_vec<gphi *, 3> lcphis;
3191 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3193 gimple *use_stmt = USE_STMT (use_p);
3194 if (is_gimple_debug (use_stmt))
3195 continue;
3196 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3197 nloop_uses++;
3198 else
3199 /* We can have more than one loop-closed PHI. */
3200 lcphis.safe_push (as_a <gphi *> (use_stmt));
3201 if (nloop_uses > 1)
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3205 "reduction used in loop.\n");
3206 return NULL;
3210 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3211 defined in the inner loop. */
3212 if (phi_def)
3214 op1 = PHI_ARG_DEF (def_stmt, 0);
3216 if (gimple_phi_num_args (def_stmt) != 1
3217 || TREE_CODE (op1) != SSA_NAME)
3219 if (dump_enabled_p ())
3220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3221 "unsupported phi node definition.\n");
3223 return NULL;
3226 def1 = SSA_NAME_DEF_STMT (op1);
3227 if (gimple_bb (def1)
3228 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3229 && loop->inner
3230 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3231 && is_gimple_assign (def1)
3232 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3234 if (dump_enabled_p ())
3235 report_vect_op (MSG_NOTE, def_stmt,
3236 "detected double reduction: ");
3238 *double_reduc = true;
3239 return def_stmt;
3242 return NULL;
3245 /* If we are vectorizing an inner reduction, we execute it in the
3246 original order only when we are not dealing with a double
3247 reduction. */
3248 bool check_reduction = true;
3249 if (flow_loop_nested_p (vect_loop, loop))
3251 gphi *lcphi;
3252 unsigned i;
3253 check_reduction = false;
3254 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3255 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3257 gimple *use_stmt = USE_STMT (use_p);
3258 if (is_gimple_debug (use_stmt))
3259 continue;
3260 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3261 check_reduction = true;
3265 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3266 code = orig_code = gimple_assign_rhs_code (def_stmt);
3268 /* We can handle "res -= x[i]", which is non-associative, by
3269 simply rewriting it into "res += -x[i]". Avoid changing the
3270 gimple instruction for the first simple tests and only do this
3271 if we're allowed to change the code at all. */
3272 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3273 code = PLUS_EXPR;
3275 if (code == COND_EXPR)
3277 if (! nested_in_vect_loop)
3278 *v_reduc_type = COND_REDUCTION;
3280 op3 = gimple_assign_rhs1 (def_stmt);
3281 if (COMPARISON_CLASS_P (op3))
3283 op4 = TREE_OPERAND (op3, 1);
3284 op3 = TREE_OPERAND (op3, 0);
3286 if (op3 == phi_name || op4 == phi_name)
3288 if (dump_enabled_p ())
3289 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3290 "reduction: condition depends on previous"
3291 " iteration: ");
3292 return NULL;
3295 op1 = gimple_assign_rhs2 (def_stmt);
3296 op2 = gimple_assign_rhs3 (def_stmt);
3298 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3300 if (dump_enabled_p ())
3301 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3302 "reduction: not commutative/associative: ");
3303 return NULL;
3305 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3307 op1 = gimple_assign_rhs1 (def_stmt);
3308 op2 = gimple_assign_rhs2 (def_stmt);
3310 else
3312 if (dump_enabled_p ())
3313 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3314 "reduction: not handled operation: ");
3315 return NULL;
3318 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3320 if (dump_enabled_p ())
3321 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3322 "reduction: both uses not ssa_names: ");
3324 return NULL;
3327 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3328 if ((TREE_CODE (op1) == SSA_NAME
3329 && !types_compatible_p (type,TREE_TYPE (op1)))
3330 || (TREE_CODE (op2) == SSA_NAME
3331 && !types_compatible_p (type, TREE_TYPE (op2)))
3332 || (op3 && TREE_CODE (op3) == SSA_NAME
3333 && !types_compatible_p (type, TREE_TYPE (op3)))
3334 || (op4 && TREE_CODE (op4) == SSA_NAME
3335 && !types_compatible_p (type, TREE_TYPE (op4))))
3337 if (dump_enabled_p ())
3339 dump_printf_loc (MSG_NOTE, vect_location,
3340 "reduction: multiple types: operation type: ");
3341 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3342 dump_printf (MSG_NOTE, ", operands types: ");
3343 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3344 TREE_TYPE (op1));
3345 dump_printf (MSG_NOTE, ",");
3346 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3347 TREE_TYPE (op2));
3348 if (op3)
3350 dump_printf (MSG_NOTE, ",");
3351 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3352 TREE_TYPE (op3));
3355 if (op4)
3357 dump_printf (MSG_NOTE, ",");
3358 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3359 TREE_TYPE (op4));
3361 dump_printf (MSG_NOTE, "\n");
3364 return NULL;
3367 /* Check whether it's ok to change the order of the computation.
3368 Generally, when vectorizing a reduction we change the order of the
3369 computation. This may change the behavior of the program in some
3370 cases, so we need to check that this is ok. One exception is when
3371 vectorizing an outer-loop: the inner-loop is executed sequentially,
3372 and therefore vectorizing reductions in the inner-loop during
3373 outer-loop vectorization is safe. */
3374 if (check_reduction
3375 && *v_reduc_type == TREE_CODE_REDUCTION
3376 && needs_fold_left_reduction_p (type, code,
3377 need_wrapping_integral_overflow))
3378 *v_reduc_type = FOLD_LEFT_REDUCTION;
3380 /* Reduction is safe. We're dealing with one of the following:
3381 1) integer arithmetic and no trapv
3382 2) floating point arithmetic, and special flags permit this optimization
3383 3) nested cycle (i.e., outer loop vectorization). */
3384 if (TREE_CODE (op1) == SSA_NAME)
3385 def1 = SSA_NAME_DEF_STMT (op1);
3387 if (TREE_CODE (op2) == SSA_NAME)
3388 def2 = SSA_NAME_DEF_STMT (op2);
3390 if (code != COND_EXPR
3391 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3393 if (dump_enabled_p ())
3394 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3395 return NULL;
3398 /* Check that one def is the reduction def, defined by PHI,
3399 the other def is either defined in the loop ("vect_internal_def"),
3400 or it's an induction (defined by a loop-header phi-node). */
3402 if (def2 && def2 == phi
3403 && (code == COND_EXPR
3404 || !def1 || gimple_nop_p (def1)
3405 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3406 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3407 && (is_gimple_assign (def1)
3408 || is_gimple_call (def1)
3409 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3410 == vect_induction_def
3411 || (gimple_code (def1) == GIMPLE_PHI
3412 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3413 == vect_internal_def
3414 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3416 if (dump_enabled_p ())
3417 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3418 return def_stmt;
3421 if (def1 && def1 == phi
3422 && (code == COND_EXPR
3423 || !def2 || gimple_nop_p (def2)
3424 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3425 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3426 && (is_gimple_assign (def2)
3427 || is_gimple_call (def2)
3428 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3429 == vect_induction_def
3430 || (gimple_code (def2) == GIMPLE_PHI
3431 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3432 == vect_internal_def
3433 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3435 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3437 /* Check if we can swap operands (just for simplicity - so that
3438 the rest of the code can assume that the reduction variable
3439 is always the last (second) argument). */
3440 if (code == COND_EXPR)
3442 /* Swap cond_expr by inverting the condition. */
3443 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3444 enum tree_code invert_code = ERROR_MARK;
3445 enum tree_code cond_code = TREE_CODE (cond_expr);
3447 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3449 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3450 invert_code = invert_tree_comparison (cond_code, honor_nans);
3452 if (invert_code != ERROR_MARK)
3454 TREE_SET_CODE (cond_expr, invert_code);
3455 swap_ssa_operands (def_stmt,
3456 gimple_assign_rhs2_ptr (def_stmt),
3457 gimple_assign_rhs3_ptr (def_stmt));
3459 else
3461 if (dump_enabled_p ())
3462 report_vect_op (MSG_NOTE, def_stmt,
3463 "detected reduction: cannot swap operands "
3464 "for cond_expr");
3465 return NULL;
3468 else
3469 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3470 gimple_assign_rhs2_ptr (def_stmt));
3472 if (dump_enabled_p ())
3473 report_vect_op (MSG_NOTE, def_stmt,
3474 "detected reduction: need to swap operands: ");
3476 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3477 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3479 else
3481 if (dump_enabled_p ())
3482 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3485 return def_stmt;
3488 /* Try to find SLP reduction chain. */
3489 if (! nested_in_vect_loop
3490 && code != COND_EXPR
3491 && orig_code != MINUS_EXPR
3492 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3494 if (dump_enabled_p ())
3495 report_vect_op (MSG_NOTE, def_stmt,
3496 "reduction: detected reduction chain: ");
3498 return def_stmt;
3501 /* Dissolve the group possibly half-built by vect_is_slp_reduction. */
3502 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3503 while (first)
3505 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3506 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3507 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3508 first = next;
3511 /* Look for the expression computing loop_arg from loop PHI result. */
3512 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3513 code))
3514 return def_stmt;
3516 if (dump_enabled_p ())
3518 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3519 "reduction: unknown pattern: ");
3522 return NULL;
3525 /* Wrapper around vect_is_simple_reduction, which will modify code
3526 in-place if it enables detection of more reductions. Arguments
3527 as there. */
3529 gimple *
3530 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3531 bool *double_reduc,
3532 bool need_wrapping_integral_overflow)
3534 enum vect_reduction_type v_reduc_type;
3535 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3536 need_wrapping_integral_overflow,
3537 &v_reduc_type);
3538 if (def)
3540 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3541 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3542 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3543 reduc_def_info = vinfo_for_stmt (def);
3544 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3545 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3547 return def;
3550 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3551 int
3552 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3553 int *peel_iters_epilogue,
3554 stmt_vector_for_cost *scalar_cost_vec,
3555 stmt_vector_for_cost *prologue_cost_vec,
3556 stmt_vector_for_cost *epilogue_cost_vec)
3558 int retval = 0;
3559 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3561 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3563 *peel_iters_epilogue = assumed_vf / 2;
3564 if (dump_enabled_p ())
3565 dump_printf_loc (MSG_NOTE, vect_location,
3566 "cost model: epilogue peel iters set to vf/2 "
3567 "because loop iterations are unknown .\n");
3569 /* If peeled iterations are known but the number of scalar loop
3570 iterations is unknown, count a taken branch per peeled loop. */
3571 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3572 NULL, 0, vect_prologue);
3573 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3574 NULL, 0, vect_epilogue);
3576 else
3578 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3579 peel_iters_prologue = niters < peel_iters_prologue ?
3580 niters : peel_iters_prologue;
3581 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3582 /* If we need to peel for gaps, but no epilogue peeling would otherwise be
3583 required, we have to peel VF iterations. */
3584 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3585 *peel_iters_epilogue = assumed_vf;
3588 stmt_info_for_cost *si;
3589 int j;
3590 if (peel_iters_prologue)
3591 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3593 stmt_vec_info stmt_info
3594 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3595 retval += record_stmt_cost (prologue_cost_vec,
3596 si->count * peel_iters_prologue,
3597 si->kind, stmt_info, si->misalign,
3598 vect_prologue);
3600 if (*peel_iters_epilogue)
3601 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3603 stmt_vec_info stmt_info
3604 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3605 retval += record_stmt_cost (epilogue_cost_vec,
3606 si->count * *peel_iters_epilogue,
3607 si->kind, stmt_info, si->misalign,
3608 vect_epilogue);
3611 return retval;
3614 /* Function vect_estimate_min_profitable_iters
3616 Return the number of iterations required for the vector version of the
3617 loop to be profitable relative to the cost of the scalar version of the
3618 loop.
3620 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3621 of iterations for vectorization. A value of -1 means loop vectorization
3622 is not profitable. This returned value may be used for a dynamic
3623 profitability check.
3625 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3626 for static check against estimated number of iterations. */
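/* Editor's sketch of the comparison behind these thresholds (ignoring
   peeling and rounding): vectorization wins for an iteration count N
   roughly when

     scalar_single_iter_cost * N
       > vec_outside_cost + vec_inside_cost * (N / assumed_vf)

   and *RET_MIN_PROFITABLE_NITERS is the smallest such N derived from the
   costs accumulated below.  */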
3628 static void
3629 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3630 int *ret_min_profitable_niters,
3631 int *ret_min_profitable_estimate)
3633 int min_profitable_iters;
3634 int min_profitable_estimate;
3635 int peel_iters_prologue;
3636 int peel_iters_epilogue;
3637 unsigned vec_inside_cost = 0;
3638 int vec_outside_cost = 0;
3639 unsigned vec_prologue_cost = 0;
3640 unsigned vec_epilogue_cost = 0;
3641 int scalar_single_iter_cost = 0;
3642 int scalar_outside_cost = 0;
3643 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3644 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3645 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3647 /* Cost model disabled. */
3648 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3650 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3651 *ret_min_profitable_niters = 0;
3652 *ret_min_profitable_estimate = 0;
3653 return;
3656 /* Requires loop versioning tests to handle misalignment. */
3657 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3659 /* FIXME: Make cost depend on complexity of individual check. */
3660 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3661 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3662 vect_prologue);
3663 dump_printf (MSG_NOTE,
3664 "cost model: Adding cost of checks for loop "
3665 "versioning to treat misalignment.\n");
3668 /* Requires loop versioning with alias checks. */
3669 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3671 /* FIXME: Make cost depend on complexity of individual check. */
3672 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3673 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3674 vect_prologue);
3675 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3676 if (len)
3677 /* Count LEN - 1 ANDs and LEN comparisons. */
3678 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3679 NULL, 0, vect_prologue);
3680 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3681 if (len)
3683 /* Count LEN - 1 ANDs and LEN comparisons. */
3684 unsigned int nstmts = len * 2 - 1;
3685 /* +1 for each bias that needs adding. */
3686 for (unsigned int i = 0; i < len; ++i)
3687 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3688 nstmts += 1;
3689 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3690 NULL, 0, vect_prologue);
3692 dump_printf (MSG_NOTE,
3693 "cost model: Adding cost of checks for loop "
3694 "versioning aliasing.\n");
3697 /* Requires loop versioning with niter checks. */
3698 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3700 /* FIXME: Make cost depend on complexity of individual check. */
3701 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3702 vect_prologue);
3703 dump_printf (MSG_NOTE,
3704 "cost model: Adding cost of checks for loop "
3705 "versioning niters.\n");
3708 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3709 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3710 vect_prologue);
3712 /* Count statements in scalar loop. Using this as scalar cost for a single
3713 iteration for now.
3715 TODO: Add outer loop support.
3717 TODO: Consider assigning different costs to different scalar
3718 statements. */
3720 scalar_single_iter_cost
3721 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3723 /* Add additional cost for the peeled instructions in prologue and epilogue
3724 loop. (For fully-masked loops there will be no peeling.)
3726 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3727 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3729 TODO: Build an expression that represents peel_iters for prologue and
3730 epilogue to be used in a run-time test. */
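/* Editor's example: with assumed_vf == 8 and unknown peel counts, the code
   below charges 8 / 2 == 4 scalar iterations to the prologue and 4 to the
   epilogue, plus the guard branches around them.  */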
3732 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3734 peel_iters_prologue = 0;
3735 peel_iters_epilogue = 0;
3737 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3739 /* We need to peel exactly one iteration. */
3740 peel_iters_epilogue += 1;
3741 stmt_info_for_cost *si;
3742 int j;
3743 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3744 j, si)
3746 struct _stmt_vec_info *stmt_info
3747 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3748 (void) add_stmt_cost (target_cost_data, si->count,
3749 si->kind, stmt_info, si->misalign,
3750 vect_epilogue);
3754 else if (npeel < 0)
3756 peel_iters_prologue = assumed_vf / 2;
3757 dump_printf (MSG_NOTE, "cost model: "
3758 "prologue peel iters set to vf/2.\n");
3760 /* If peeling for alignment is unknown, the loop bound of the main loop
3761 becomes unknown. */
3762 peel_iters_epilogue = assumed_vf / 2;
3763 dump_printf (MSG_NOTE, "cost model: "
3764 "epilogue peel iters set to vf/2 because "
3765 "peeling for alignment is unknown.\n");
3767 /* If peeled iterations are unknown, count a taken branch and a not taken
3768 branch per peeled loop. Even if scalar loop iterations are known,
3769 vector iterations are not known since peeled prologue iterations are
3770 not known. Hence guards remain the same. */
3771 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3772 NULL, 0, vect_prologue);
3773 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3774 NULL, 0, vect_prologue);
3775 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3776 NULL, 0, vect_epilogue);
3777 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3778 NULL, 0, vect_epilogue);
3779 stmt_info_for_cost *si;
3780 int j;
3781 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3783 struct _stmt_vec_info *stmt_info
3784 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3785 (void) add_stmt_cost (target_cost_data,
3786 si->count * peel_iters_prologue,
3787 si->kind, stmt_info, si->misalign,
3788 vect_prologue);
3789 (void) add_stmt_cost (target_cost_data,
3790 si->count * peel_iters_epilogue,
3791 si->kind, stmt_info, si->misalign,
3792 vect_epilogue);
3795 else
3797 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3798 stmt_info_for_cost *si;
3799 int j;
3800 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3802 prologue_cost_vec.create (2);
3803 epilogue_cost_vec.create (2);
3804 peel_iters_prologue = npeel;
3806 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3807 &peel_iters_epilogue,
3808 &LOOP_VINFO_SCALAR_ITERATION_COST
3809 (loop_vinfo),
3810 &prologue_cost_vec,
3811 &epilogue_cost_vec);
3813 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3815 struct _stmt_vec_info *stmt_info
3816 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3817 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3818 si->misalign, vect_prologue);
3821 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3823 struct _stmt_vec_info *stmt_info
3824 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3825 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3826 si->misalign, vect_epilogue);
3829 prologue_cost_vec.release ();
3830 epilogue_cost_vec.release ();
3833 /* FORNOW: The scalar outside cost is incremented in one of the
3834 following ways:
3836 1. The vectorizer checks for alignment and aliasing and generates
3837 a condition that allows dynamic vectorization. A cost model
3838 check is ANDED with the versioning condition. Hence scalar code
3839 path now has the added cost of the versioning check.
3841 if (cost > th & versioning_check)
3842 jmp to vector code
3844 Hence run-time scalar is incremented by not-taken branch cost.
3846 2. The vectorizer then checks if a prologue is required. If the
3847 cost model check was not done before during versioning, it has to
3848 be done before the prologue check.
3850 if (cost <= th)
3851 prologue = scalar_iters
3852 if (prologue == 0)
3853 jmp to vector code
3854 else
3855 execute prologue
3856 if (prologue == num_iters)
3857 go to exit
3859 Hence the run-time scalar cost is incremented by a taken branch,
3860 plus a not-taken branch, plus a taken branch cost.
3862 3. The vectorizer then checks if an epilogue is required. If the
3863 cost model check was not done before during prologue check, it
3864 has to be done with the epilogue check.
3866 if (prologue == 0)
3867 jmp to vector code
3868 else
3869 execute prologue
3870 if (prologue == num_iters)
3871 go to exit
3872 vector code:
3873 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3874 jmp to epilogue
3876 Hence the run-time scalar cost should be incremented by 2 taken
3877 branches.
3879 TODO: The back end may reorder the BBs differently and reverse
3880 conditions/branch directions. Change the estimates below to
3881 something more reasonable. */
3883 /* If the number of iterations is known and we do not do versioning, we can
3884 decide whether to vectorize at compile time. Hence the scalar version
3885 does not carry cost model guard costs. */
3886 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3887 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3889 /* Cost model check occurs at versioning. */
3890 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3891 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3892 else
3894 /* Cost model check occurs at prologue generation. */
3895 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3896 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3897 + vect_get_stmt_cost (cond_branch_not_taken);
3898 /* Cost model check occurs at epilogue generation. */
3899 else
3900 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3904 /* Complete the target-specific cost calculations. */
3905 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3906 &vec_inside_cost, &vec_epilogue_cost);
3908 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3910 if (dump_enabled_p ())
3912 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3913 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3914 vec_inside_cost);
3915 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3916 vec_prologue_cost);
3917 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3918 vec_epilogue_cost);
3919 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3920 scalar_single_iter_cost);
3921 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3922 scalar_outside_cost);
3923 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3924 vec_outside_cost);
3925 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3926 peel_iters_prologue);
3927 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3928 peel_iters_epilogue);
3931 /* Calculate number of iterations required to make the vector version
3932 profitable, relative to the loop bodies only. The following condition
3933 must hold true:
3934 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3935 where
3936 SIC = scalar iteration cost, VIC = vector iteration cost,
3937 VOC = vector outside cost, VF = vectorization factor,
3938 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
3939 SOC = scalar outside cost for run time cost model check. */
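/* Worked example with hypothetical costs: SIC = 4, VIC = 6, VF = 4,
   VOC = 20, SOC = 6 and no peeling. The code below computes
   ((20 - 6) * 4 - 0 - 0) / (4 * 4 - 6) = 5 and then rounds up to 6,
   since at niters = 5 the scalar cost (4 * 5 + 6 = 26) is still below
   the vector cost (6 * 5/4 + 20 = 27.5), so vectorization does not yet
   pay off.  */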
3941 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3943 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3944 * assumed_vf
3945 - vec_inside_cost * peel_iters_prologue
3946 - vec_inside_cost * peel_iters_epilogue);
3947 if (min_profitable_iters <= 0)
3948 min_profitable_iters = 0;
3949 else
3951 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3952 - vec_inside_cost);
3954 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3955 <= (((int) vec_inside_cost * min_profitable_iters)
3956 + (((int) vec_outside_cost - scalar_outside_cost)
3957 * assumed_vf)))
3958 min_profitable_iters++;
3961 /* vector version will never be profitable. */
3962 else
3964 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3965 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3966 "did not happen for a simd loop");
3968 if (dump_enabled_p ())
3969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3970 "cost model: the vector iteration cost = %d "
3971 "divided by the scalar iteration cost = %d "
3972 "is greater or equal to the vectorization factor = %d"
3973 ".\n",
3974 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3975 *ret_min_profitable_niters = -1;
3976 *ret_min_profitable_estimate = -1;
3977 return;
3980 dump_printf (MSG_NOTE,
3981 " Calculated minimum iters for profitability: %d\n",
3982 min_profitable_iters);
3984 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3985 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3986 /* We want the vectorized loop to execute at least once. */
3987 min_profitable_iters = assumed_vf + peel_iters_prologue;
3989 if (dump_enabled_p ())
3990 dump_printf_loc (MSG_NOTE, vect_location,
3991 " Runtime profitability threshold = %d\n",
3992 min_profitable_iters);
3994 *ret_min_profitable_niters = min_profitable_iters;
3996 /* Calculate number of iterations required to make the vector version
3997 profitable, relative to the loop bodies only.
3999 Non-vectorized variant is SIC * niters and it must win over vector
4000 variant on the expected loop trip count. The following condition must hold true:
4001 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
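/* Solving the condition above for niters (with the same integer
   truncation as the code below) gives
   niters > ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
	    / (SIC * VF - VIC).  */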
4003 if (vec_outside_cost <= 0)
4004 min_profitable_estimate = 0;
4005 else
4007 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4008 * assumed_vf
4009 - vec_inside_cost * peel_iters_prologue
4010 - vec_inside_cost * peel_iters_epilogue)
4011 / ((scalar_single_iter_cost * assumed_vf)
4012 - vec_inside_cost);
4014 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4015 if (dump_enabled_p ())
4016 dump_printf_loc (MSG_NOTE, vect_location,
4017 " Static estimate profitability threshold = %d\n",
4018 min_profitable_estimate);
4020 *ret_min_profitable_estimate = min_profitable_estimate;
4023 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4024 vector elements (not bits) for a vector with NELT elements. */
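/* For example, OFFSET = 2 with NELT = 8 encodes the stepped series
   { 2, 3, 4, ... }, i.e. the selector { 2, 3, 4, 5, 6, 7, 8, 9 };
   indices 8 and 9 (>= NELT) select from the second vec_perm operand,
   which is what produces the whole-vector shift effect.  */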
4025 static void
4026 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4027 vec_perm_builder *sel)
4029 /* The encoding is a single stepped pattern. Any wrap-around is handled
4030 by vec_perm_indices. */
4031 sel->new_vector (nelt, 1, 3);
4032 for (unsigned int i = 0; i < 3; i++)
4033 sel->quick_push (i + offset);
4036 /* Checks whether the target supports whole-vector shifts for vectors of mode
4037 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4038 it supports vec_perm_const with masks for all necessary shift amounts. */
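/* For example, for 8-element vectors the necessary shift amounts are 4, 2
   and 1 elements, matching the successive halving steps of the reduction
   epilogue.  */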
4039 static bool
4040 have_whole_vector_shift (machine_mode mode)
4042 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4043 return true;
4045 /* Variable-length vectors should be handled via the optab. */
4046 unsigned int nelt;
4047 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4048 return false;
4050 vec_perm_builder sel;
4051 vec_perm_indices indices;
4052 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4054 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4055 indices.new_vector (sel, 2, nelt);
4056 if (!can_vec_perm_const_p (mode, indices, false))
4057 return false;
4059 return true;
4062 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4063 functions. Design better to avoid maintenance issues. */
4065 /* Function vect_model_reduction_cost.
4067 Models cost for a reduction operation, including the vector ops
4068 generated within the strip-mine loop, the initial definition before
4069 the loop, and the epilogue code that must be generated. */
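/* For instance, a plain sum reduction with a direct REDUC_FN and
   NCOPIES == 1 (not nested in an outer loop) costs one scalar_to_vec in
   the prologue, one vector_stmt in the body, and one vector_stmt plus
   one vec_to_scalar in the epilogue.  */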
4071 static void
4072 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4073 int ncopies)
4075 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4076 enum tree_code code;
4077 optab optab;
4078 tree vectype;
4079 gimple *orig_stmt;
4080 machine_mode mode;
4081 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4082 struct loop *loop = NULL;
4083 void *target_cost_data;
4085 if (loop_vinfo)
4087 loop = LOOP_VINFO_LOOP (loop_vinfo);
4088 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4090 else
4091 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4093 /* Condition reductions generate two reductions in the loop. */
4094 vect_reduction_type reduction_type
4095 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4096 if (reduction_type == COND_REDUCTION)
4097 ncopies *= 2;
4099 vectype = STMT_VINFO_VECTYPE (stmt_info);
4100 mode = TYPE_MODE (vectype);
4101 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4103 if (!orig_stmt)
4104 orig_stmt = STMT_VINFO_STMT (stmt_info);
4106 code = gimple_assign_rhs_code (orig_stmt);
4108 if (reduction_type == EXTRACT_LAST_REDUCTION
4109 || reduction_type == FOLD_LEFT_REDUCTION)
4111 /* No extra instructions needed in the prologue. */
4112 prologue_cost = 0;
4114 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4115 /* Count one reduction-like operation per vector. */
4116 inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4117 stmt_info, 0, vect_body);
4118 else
4120 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4121 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4122 inside_cost = add_stmt_cost (target_cost_data, nelements,
4123 vec_to_scalar, stmt_info, 0,
4124 vect_body);
4125 inside_cost += add_stmt_cost (target_cost_data, nelements,
4126 scalar_stmt, stmt_info, 0,
4127 vect_body);
4130 else
4132 /* Add in cost for initial definition.
4133 For cond reduction we have four vectors: initial index, step,
4134 initial result of the data reduction, initial value of the index
4135 reduction. */
4136 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4137 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4138 scalar_to_vec, stmt_info, 0,
4139 vect_prologue);
4141 /* Cost of reduction op inside loop. */
4142 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4143 stmt_info, 0, vect_body);
4146 /* Determine cost of epilogue code.
4148 We have a reduction operator that will reduce the vector in one statement.
4149 Also requires scalar extract. */
4151 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4153 if (reduc_fn != IFN_LAST)
4155 if (reduction_type == COND_REDUCTION)
4157 /* An EQ stmt and a COND_EXPR stmt. */
4158 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4159 vector_stmt, stmt_info, 0,
4160 vect_epilogue);
4161 /* Reduction of the max index and a reduction of the found
4162 values. */
4163 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4164 vec_to_scalar, stmt_info, 0,
4165 vect_epilogue);
4166 /* A broadcast of the max value. */
4167 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4168 scalar_to_vec, stmt_info, 0,
4169 vect_epilogue);
4171 else
4173 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4174 stmt_info, 0, vect_epilogue);
4175 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4176 vec_to_scalar, stmt_info, 0,
4177 vect_epilogue);
4180 else if (reduction_type == COND_REDUCTION)
4182 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4183 /* Extraction of scalar elements. */
4184 epilogue_cost += add_stmt_cost (target_cost_data,
4185 2 * estimated_nunits,
4186 vec_to_scalar, stmt_info, 0,
4187 vect_epilogue);
4188 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4189 epilogue_cost += add_stmt_cost (target_cost_data,
4190 2 * estimated_nunits - 3,
4191 scalar_stmt, stmt_info, 0,
4192 vect_epilogue);
4194 else if (reduction_type == EXTRACT_LAST_REDUCTION
4195 || reduction_type == FOLD_LEFT_REDUCTION)
4196 /* No extra instructions needed in the epilogue. */
4198 else
4200 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4201 tree bitsize =
4202 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4203 int element_bitsize = tree_to_uhwi (bitsize);
4204 int nelements = vec_size_in_bits / element_bitsize;
4206 if (code == COND_EXPR)
4207 code = MAX_EXPR;
4209 optab = optab_for_tree_code (code, vectype, optab_default);
4211 /* We have a whole vector shift available. */
4212 if (optab != unknown_optab
4213 && VECTOR_MODE_P (mode)
4214 && optab_handler (optab, mode) != CODE_FOR_nothing
4215 && have_whole_vector_shift (mode))
4217 /* Final reduction via vector shifts and the reduction operator.
4218 Also requires scalar extract. */
4219 epilogue_cost += add_stmt_cost (target_cost_data,
4220 exact_log2 (nelements) * 2,
4221 vector_stmt, stmt_info, 0,
4222 vect_epilogue);
4223 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4224 vec_to_scalar, stmt_info, 0,
4225 vect_epilogue);
4227 else
4228 /* Use extracts and reduction op for final reduction. For N
4229 elements, we have N extracts and N-1 reduction ops. */
4230 epilogue_cost += add_stmt_cost (target_cost_data,
4231 nelements + nelements - 1,
4232 vector_stmt, stmt_info, 0,
4233 vect_epilogue);
4237 if (dump_enabled_p ())
4238 dump_printf (MSG_NOTE,
4239 "vect_model_reduction_cost: inside_cost = %d, "
4240 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4241 prologue_cost, epilogue_cost);
4245 /* Function vect_model_induction_cost.
4247 Models cost for induction operations. */
4249 static void
4250 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4252 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4253 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4254 unsigned inside_cost, prologue_cost;
4256 if (PURE_SLP_STMT (stmt_info))
4257 return;
4259 /* loop cost for vec_loop. */
4260 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4261 stmt_info, 0, vect_body);
4263 /* prologue cost for vec_init and vec_step. */
4264 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4265 stmt_info, 0, vect_prologue);
4267 if (dump_enabled_p ())
4268 dump_printf_loc (MSG_NOTE, vect_location,
4269 "vect_model_induction_cost: inside_cost = %d, "
4270 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4275 /* Function get_initial_def_for_reduction
4277 Input:
4278 STMT - a stmt that performs a reduction operation in the loop.
4279 INIT_VAL - the initial value of the reduction variable
4281 Output:
4282 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4283 of the reduction (used for adjusting the epilog - see below).
4284 Return a vector variable, initialized according to the operation that STMT
4285 performs. This vector will be used as the initial value of the
4286 vector of partial results.
4288 Option1 (adjust in epilog): Initialize the vector as follows:
4289 add/bit or/xor: [0,0,...,0,0]
4290 mult/bit and: [1,1,...,1,1]
4291 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4292 and when necessary (e.g. add/mult case) let the caller know
4293 that it needs to adjust the result by init_val.
4295 Option2: Initialize the vector as follows:
4296 add/bit or/xor: [init_val,0,0,...,0]
4297 mult/bit and: [init_val,1,1,...,1]
4298 min/max/cond_expr: [init_val,init_val,...,init_val]
4299 and no adjustments are needed.
4301 For example, for the following code:
4303 s = init_val;
4304 for (i=0;i<n;i++)
4305 s = s + a[i];
4307 STMT is 's = s + a[i]', and the reduction variable is 's'.
4308 For a vector of 4 units, we want to return either [0,0,0,init_val],
4309 or [0,0,0,0] and let the caller know that it needs to adjust
4310 the result at the end by 'init_val'.
4312 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4313 is not NULL, because its initialization vector is simpler (same element in
4314 all entries), and Option2 otherwise.
4316 A cost model should help decide between these two schemes. */
4318 tree
4319 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4320 tree *adjustment_def)
4322 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4323 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4324 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4325 tree scalar_type = TREE_TYPE (init_val);
4326 tree vectype = get_vectype_for_scalar_type (scalar_type);
4327 enum tree_code code = gimple_assign_rhs_code (stmt);
4328 tree def_for_init;
4329 tree init_def;
4330 bool nested_in_vect_loop = false;
4331 REAL_VALUE_TYPE real_init_val = dconst0;
4332 int int_init_val = 0;
4333 gimple *def_stmt = NULL;
4334 gimple_seq stmts = NULL;
4336 gcc_assert (vectype);
4338 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4339 || SCALAR_FLOAT_TYPE_P (scalar_type));
4341 if (nested_in_vect_loop_p (loop, stmt))
4342 nested_in_vect_loop = true;
4343 else
4344 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4346 /* In case of double reduction we only create a vector variable to be put
4347 in the reduction phi node. The actual statement creation is done in
4348 vect_create_epilog_for_reduction. */
4349 if (adjustment_def && nested_in_vect_loop
4350 && TREE_CODE (init_val) == SSA_NAME
4351 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4352 && gimple_code (def_stmt) == GIMPLE_PHI
4353 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4354 && vinfo_for_stmt (def_stmt)
4355 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4356 == vect_double_reduction_def)
4358 *adjustment_def = NULL;
4359 return vect_create_destination_var (init_val, vectype);
4362 vect_reduction_type reduction_type
4363 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4365 /* In case of a nested reduction do not use an adjustment def, as
4366 the epilogue generation does not handle that case correctly
4367 when ncopies is not one. */
4368 if (adjustment_def && nested_in_vect_loop)
4370 *adjustment_def = NULL;
4371 return vect_get_vec_def_for_operand (init_val, stmt);
4374 switch (code)
4376 case WIDEN_SUM_EXPR:
4377 case DOT_PROD_EXPR:
4378 case SAD_EXPR:
4379 case PLUS_EXPR:
4380 case MINUS_EXPR:
4381 case BIT_IOR_EXPR:
4382 case BIT_XOR_EXPR:
4383 case MULT_EXPR:
4384 case BIT_AND_EXPR:
4386 /* ADJUSTMENT_DEF is NULL when called from
4387 vect_create_epilog_for_reduction to vectorize double reduction. */
4388 if (adjustment_def)
4389 *adjustment_def = init_val;
4391 if (code == MULT_EXPR)
4393 real_init_val = dconst1;
4394 int_init_val = 1;
4397 if (code == BIT_AND_EXPR)
4398 int_init_val = -1;
4400 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4401 def_for_init = build_real (scalar_type, real_init_val);
4402 else
4403 def_for_init = build_int_cst (scalar_type, int_init_val);
4405 if (adjustment_def)
4406 /* Option1: the first element is '0' or '1' as well. */
4407 init_def = gimple_build_vector_from_val (&stmts, vectype,
4408 def_for_init);
4409 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4411 /* Option2 (variable length): the first element is INIT_VAL. */
4412 init_def = build_vector_from_val (vectype, def_for_init);
4413 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4414 2, init_def, init_val);
4415 init_def = make_ssa_name (vectype);
4416 gimple_call_set_lhs (call, init_def);
4417 gimple_seq_add_stmt (&stmts, call);
4419 else
4421 /* Option2: the first element is INIT_VAL. */
4422 tree_vector_builder elts (vectype, 1, 2);
4423 elts.quick_push (init_val);
4424 elts.quick_push (def_for_init);
4425 init_def = gimple_build_vector (&stmts, &elts);
4428 break;
4430 case MIN_EXPR:
4431 case MAX_EXPR:
4432 case COND_EXPR:
4434 if (adjustment_def)
4436 *adjustment_def = NULL_TREE;
4437 if (reduction_type != COND_REDUCTION
4438 && reduction_type != EXTRACT_LAST_REDUCTION)
4440 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4441 break;
4444 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4445 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4447 break;
4449 default:
4450 gcc_unreachable ();
4453 if (stmts)
4454 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4455 return init_def;
4458 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4459 NUMBER_OF_VECTORS is the number of vector defs to create.
4460 If NEUTRAL_OP is nonnull, introducing extra elements of that
4461 value will not change the result. */
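/* For example, 0 is a neutral value for PLUS_EXPR, BIT_IOR_EXPR and
   BIT_XOR_EXPR, 1 for MULT_EXPR and all-ones for BIT_AND_EXPR; MIN_EXPR
   and MAX_EXPR have no universal neutral value, so for them NEUTRAL_OP
   may be null and the initial scalar values are used directly.  */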
4463 static void
4464 get_initial_defs_for_reduction (slp_tree slp_node,
4465 vec<tree> *vec_oprnds,
4466 unsigned int number_of_vectors,
4467 bool reduc_chain, tree neutral_op)
4469 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4470 gimple *stmt = stmts[0];
4471 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4472 unsigned HOST_WIDE_INT nunits;
4473 unsigned j, number_of_places_left_in_vector;
4474 tree vector_type;
4475 tree vop;
4476 int group_size = stmts.length ();
4477 unsigned int vec_num, i;
4478 unsigned number_of_copies = 1;
4479 vec<tree> voprnds;
4480 voprnds.create (number_of_vectors);
4481 struct loop *loop;
4482 auto_vec<tree, 16> permute_results;
4484 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4486 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4488 loop = (gimple_bb (stmt))->loop_father;
4489 gcc_assert (loop);
4490 edge pe = loop_preheader_edge (loop);
4492 gcc_assert (!reduc_chain || neutral_op);
4494 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4495 created vectors. It is greater than 1 if unrolling is performed.
4497 For example, we have two scalar operands, s1 and s2 (e.g., group of
4498 strided accesses of size two), while NUNITS is four (i.e., four scalars
4499 of this type can be packed in a vector). The output vector will contain
4500 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4501 will be 2).
4503 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4504 containing the operands.
4506 For example, NUNITS is four as before, and the group size is 8
4507 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4508 {s5, s6, s7, s8}. */
4510 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4511 nunits = group_size;
4513 number_of_copies = nunits * number_of_vectors / group_size;
4515 number_of_places_left_in_vector = nunits;
4516 bool constant_p = true;
4517 tree_vector_builder elts (vector_type, nunits, 1);
4518 elts.quick_grow (nunits);
4519 for (j = 0; j < number_of_copies; j++)
4521 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4523 tree op;
4524 /* Get the def before the loop. In reduction chain we have only
4525 one initial value. */
4526 if ((j != (number_of_copies - 1)
4527 || (reduc_chain && i != 0))
4528 && neutral_op)
4529 op = neutral_op;
4530 else
4531 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4533 /* Create 'vect_ = {op0,op1,...,opn}'. */
4534 number_of_places_left_in_vector--;
4535 elts[number_of_places_left_in_vector] = op;
4536 if (!CONSTANT_CLASS_P (op))
4537 constant_p = false;
4539 if (number_of_places_left_in_vector == 0)
4541 gimple_seq ctor_seq = NULL;
4542 tree init;
4543 if (constant_p && !neutral_op
4544 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4545 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4546 /* Build the vector directly from ELTS. */
4547 init = gimple_build_vector (&ctor_seq, &elts);
4548 else if (neutral_op)
4550 /* Build a vector of the neutral value and shift the
4551 other elements into place. */
4552 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4553 neutral_op);
4554 int k = nunits;
4555 while (k > 0 && elts[k - 1] == neutral_op)
4556 k -= 1;
4557 while (k > 0)
4559 k -= 1;
4560 gcall *call = gimple_build_call_internal
4561 (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4562 init = make_ssa_name (vector_type);
4563 gimple_call_set_lhs (call, init);
4564 gimple_seq_add_stmt (&ctor_seq, call);
4567 else
4569 /* First time round, duplicate ELTS to fill the
4570 required number of vectors, then cherry pick the
4571 appropriate result for each iteration. */
4572 if (vec_oprnds->is_empty ())
4573 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4574 number_of_vectors,
4575 permute_results);
4576 init = permute_results[number_of_vectors - j - 1];
4578 if (ctor_seq != NULL)
4579 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4580 voprnds.quick_push (init);
4582 number_of_places_left_in_vector = nunits;
4583 elts.new_vector (vector_type, nunits, 1);
4584 elts.quick_grow (nunits);
4585 constant_p = true;
4590 /* Since the vectors are created in the reverse order, reverse them
4591 here. */
4592 vec_num = voprnds.length ();
4593 for (j = vec_num; j != 0; j--)
4595 vop = voprnds[j - 1];
4596 vec_oprnds->quick_push (vop);
4599 voprnds.release ();
4601 /* In case that VF is greater than the unrolling factor needed for the SLP
4602 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4603 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4604 to replicate the vectors. */
4605 tree neutral_vec = NULL;
4606 while (number_of_vectors > vec_oprnds->length ())
4608 if (neutral_op)
4610 if (!neutral_vec)
4612 gimple_seq ctor_seq = NULL;
4613 neutral_vec = gimple_build_vector_from_val
4614 (&ctor_seq, vector_type, neutral_op);
4615 if (ctor_seq != NULL)
4616 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4618 vec_oprnds->quick_push (neutral_vec);
4620 else
4622 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4623 vec_oprnds->quick_push (vop);
4629 /* Function vect_create_epilog_for_reduction
4631 Create code at the loop-epilog to finalize the result of a reduction
4632 computation.
4634 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4635 reduction statements.
4636 STMT is the scalar reduction stmt that is being vectorized.
4637 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4638 number of elements that we can fit in a vectype (nunits). In this case
4639 we have to generate more than one vector stmt - i.e - we need to "unroll"
4640 the vector stmt by a factor VF/nunits. For more details see documentation
4641 in vectorizable_operation.
4642 REDUC_FN is the internal function for the epilog reduction.
4643 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4644 computation.
4645 REDUC_INDEX is the index of the operand in the right hand side of the
4646 statement that is defined by REDUCTION_PHI.
4647 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4648 SLP_NODE is an SLP node containing a group of reduction statements. The
4649 first one in this group is STMT.
4650 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4651 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4652 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4653 any value of the IV in the loop.
4654 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4655 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4656 null if this is not an SLP reduction
4658 This function:
4659 1. Creates the reduction def-use cycles: sets the arguments for
4660 REDUCTION_PHIS:
4661 The loop-entry argument is the vectorized initial-value of the reduction.
4662 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4663 sums.
4664 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4665 by calling the function specified by REDUC_FN if available, or by
4666 other means (whole-vector shifts or a scalar loop).
4667 The function also creates a new phi node at the loop exit to preserve
4668 loop-closed form, as illustrated below.
4670 The flow at the entry to this function:
4672 loop:
4673 vec_def = phi <null, null> # REDUCTION_PHI
4674 VECT_DEF = vector_stmt # vectorized form of STMT
4675 s_loop = scalar_stmt # (scalar) STMT
4676 loop_exit:
4677 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4678 use <s_out0>
4679 use <s_out0>
4681 The above is transformed by this function into:
4683 loop:
4684 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4685 VECT_DEF = vector_stmt # vectorized form of STMT
4686 s_loop = scalar_stmt # (scalar) STMT
4687 loop_exit:
4688 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4689 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4690 v_out2 = reduce <v_out1>
4691 s_out3 = extract_field <v_out2, 0>
4692 s_out4 = adjust_result <s_out3>
4693 use <s_out4>
4694 use <s_out4>
4697 static void
4698 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4699 gimple *reduc_def_stmt,
4700 int ncopies, internal_fn reduc_fn,
4701 vec<gimple *> reduction_phis,
4702 bool double_reduc,
4703 slp_tree slp_node,
4704 slp_instance slp_node_instance,
4705 tree induc_val, enum tree_code induc_code,
4706 tree neutral_op)
4708 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4709 stmt_vec_info prev_phi_info;
4710 tree vectype;
4711 machine_mode mode;
4712 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4713 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4714 basic_block exit_bb;
4715 tree scalar_dest;
4716 tree scalar_type;
4717 gimple *new_phi = NULL, *phi;
4718 gimple_stmt_iterator exit_gsi;
4719 tree vec_dest;
4720 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4721 gimple *epilog_stmt = NULL;
4722 enum tree_code code = gimple_assign_rhs_code (stmt);
4723 gimple *exit_phi;
4724 tree bitsize;
4725 tree adjustment_def = NULL;
4726 tree vec_initial_def = NULL;
4727 tree expr, def, initial_def = NULL;
4728 tree orig_name, scalar_result;
4729 imm_use_iterator imm_iter, phi_imm_iter;
4730 use_operand_p use_p, phi_use_p;
4731 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4732 bool nested_in_vect_loop = false;
4733 auto_vec<gimple *> new_phis;
4734 auto_vec<gimple *> inner_phis;
4735 enum vect_def_type dt = vect_unknown_def_type;
4736 int j, i;
4737 auto_vec<tree> scalar_results;
4738 unsigned int group_size = 1, k, ratio;
4739 auto_vec<tree> vec_initial_defs;
4740 auto_vec<gimple *> phis;
4741 bool slp_reduc = false;
4742 bool direct_slp_reduc;
4743 tree new_phi_result;
4744 gimple *inner_phi = NULL;
4745 tree induction_index = NULL_TREE;
4747 if (slp_node)
4748 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4750 if (nested_in_vect_loop_p (loop, stmt))
4752 outer_loop = loop;
4753 loop = loop->inner;
4754 nested_in_vect_loop = true;
4755 gcc_assert (!slp_node);
4758 vectype = STMT_VINFO_VECTYPE (stmt_info);
4759 gcc_assert (vectype);
4760 mode = TYPE_MODE (vectype);
4762 /* 1. Create the reduction def-use cycle:
4763 Set the arguments of REDUCTION_PHIS, i.e., transform
4765 loop:
4766 vec_def = phi <null, null> # REDUCTION_PHI
4767 VECT_DEF = vector_stmt # vectorized form of STMT
4770 into:
4772 loop:
4773 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4774 VECT_DEF = vector_stmt # vectorized form of STMT
4777 (in case of SLP, do it for all the phis). */
4779 /* Get the loop-entry arguments. */
4780 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4781 if (slp_node)
4783 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4784 vec_initial_defs.reserve (vec_num);
4785 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4786 &vec_initial_defs, vec_num,
4787 GROUP_FIRST_ELEMENT (stmt_info),
4788 neutral_op);
4790 else
4792 /* Get at the scalar def before the loop, that defines the initial value
4793 of the reduction variable. */
4794 gimple *def_stmt;
4795 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4796 loop_preheader_edge (loop));
4797 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4798 and we can't use zero for induc_val, use initial_def. Similarly
4799 for REDUC_MIN and initial_def larger than the base. */
4800 if (TREE_CODE (initial_def) == INTEGER_CST
4801 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4802 == INTEGER_INDUC_COND_REDUCTION)
4803 && !integer_zerop (induc_val)
4804 && ((induc_code == MAX_EXPR
4805 && tree_int_cst_lt (initial_def, induc_val))
4806 || (induc_code == MIN_EXPR
4807 && tree_int_cst_lt (induc_val, initial_def))))
4808 induc_val = initial_def;
4809 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4810 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4811 &adjustment_def);
4812 vec_initial_defs.create (1);
4813 vec_initial_defs.quick_push (vec_initial_def);
4816 /* Set phi nodes arguments. */
4817 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4819 tree vec_init_def = vec_initial_defs[i];
4820 tree def = vect_defs[i];
4821 for (j = 0; j < ncopies; j++)
4823 if (j != 0)
4825 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4826 if (nested_in_vect_loop)
4827 vec_init_def
4828 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4829 vec_init_def);
4832 /* Set the loop-entry arg of the reduction-phi. */
4834 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4835 == INTEGER_INDUC_COND_REDUCTION)
4837 /* Initialise the reduction phi to zero. This prevents non-zero
4838 initial values from interfering with the reduction op. */
4839 gcc_assert (ncopies == 1);
4840 gcc_assert (i == 0);
4842 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4843 tree induc_val_vec
4844 = build_vector_from_val (vec_init_def_type, induc_val);
4846 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4847 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4849 else
4850 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4851 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4853 /* Set the loop-latch arg for the reduction-phi. */
4854 if (j > 0)
4855 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4857 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4858 UNKNOWN_LOCATION);
4860 if (dump_enabled_p ())
4862 dump_printf_loc (MSG_NOTE, vect_location,
4863 "transform reduction: created def-use cycle: ");
4864 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4865 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4870 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4871 which is updated with the current index of the loop for every match of
4872 the original loop's cond_expr (VEC_STMT). This results in a vector
4873 containing the last time the condition passed for that vector lane.
4874 The first match will be a 1 to allow 0 to be used for non-matching
4875 indexes. If there are no matches at all then the vector will be all
4876 zeroes. */
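/* Illustrative example with a 4-element vector: the IV starts at
   { 1, 2, 3, 4 } and steps by 4 each vector iteration. If the condition
   matches in lanes 1 and 3 of the first iteration and only in lane 1 of
   the second, the vector becomes { 0, 2, 0, 4 } and then { 0, 6, 0, 4 }:
   each lane holds the 1-based position of its last match, or 0 if it
   never matched.  */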
4877 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4879 tree indx_before_incr, indx_after_incr;
4880 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4882 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4883 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4885 int scalar_precision
4886 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4887 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4888 tree cr_index_vector_type = build_vector_type
4889 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4891 /* First we create a simple vector induction variable which starts
4892 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4893 vector size (STEP). */
4895 /* Create a {1,2,3,...} vector. */
4896 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4898 /* Create a vector of the step value. */
4899 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4900 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4902 /* Create an induction variable. */
4903 gimple_stmt_iterator incr_gsi;
4904 bool insert_after;
4905 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4906 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4907 insert_after, &indx_before_incr, &indx_after_incr);
4909 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4910 filled with zeros (VEC_ZERO). */
4912 /* Create a vector of 0s. */
4913 tree zero = build_zero_cst (cr_index_scalar_type);
4914 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4916 /* Create a vector phi node. */
4917 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4918 new_phi = create_phi_node (new_phi_tree, loop->header);
4919 set_vinfo_for_stmt (new_phi,
4920 new_stmt_vec_info (new_phi, loop_vinfo));
4921 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4922 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4924 /* Now take the condition from the loops original cond_expr
4925 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4926 every match uses values from the induction variable
4927 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4928 (NEW_PHI_TREE).
4929 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4930 the new cond_expr (INDEX_COND_EXPR). */
4932 /* Duplicate the condition from vec_stmt. */
4933 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4935 /* Create a conditional, where the condition is taken from vec_stmt
4936 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4937 else is the phi (NEW_PHI_TREE). */
4938 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4939 ccompare, indx_before_incr,
4940 new_phi_tree);
4941 induction_index = make_ssa_name (cr_index_vector_type);
4942 gimple *index_condition = gimple_build_assign (induction_index,
4943 index_cond_expr);
4944 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4945 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4946 loop_vinfo);
4947 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4948 set_vinfo_for_stmt (index_condition, index_vec_info);
4950 /* Update the phi with the vec cond. */
4951 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4952 loop_latch_edge (loop), UNKNOWN_LOCATION);
4955 /* 2. Create epilog code.
4956 The reduction epilog code operates across the elements of the vector
4957 of partial results computed by the vectorized loop.
4958 The reduction epilog code consists of:
4960 step 1: compute the scalar result in a vector (v_out2)
4961 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4962 step 3: adjust the scalar result (s_out3) if needed.
4964 Step 1 can be accomplished using one the following three schemes:
4965 (scheme 1) using reduc_fn, if available.
4966 (scheme 2) using whole-vector shifts, if available.
4967 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4968 combined.
4970 The overall epilog code looks like this:
4972 s_out0 = phi <s_loop> # original EXIT_PHI
4973 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4974 v_out2 = reduce <v_out1> # step 1
4975 s_out3 = extract_field <v_out2, 0> # step 2
4976 s_out4 = adjust_result <s_out3> # step 3
4978 (step 3 is optional, and steps 1 and 2 may be combined).
4979 Lastly, the uses of s_out0 are replaced by s_out4. */
4982 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4983 v_out1 = phi <VECT_DEF>
4984 Store them in NEW_PHIS. */
4986 exit_bb = single_exit (loop)->dest;
4987 prev_phi_info = NULL;
4988 new_phis.create (vect_defs.length ());
4989 FOR_EACH_VEC_ELT (vect_defs, i, def)
4991 for (j = 0; j < ncopies; j++)
4993 tree new_def = copy_ssa_name (def);
4994 phi = create_phi_node (new_def, exit_bb);
4995 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4996 if (j == 0)
4997 new_phis.quick_push (phi);
4998 else
5000 def = vect_get_vec_def_for_stmt_copy (dt, def);
5001 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5004 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5005 prev_phi_info = vinfo_for_stmt (phi);
5009 /* The epilogue is created for the outer-loop, i.e., for the loop being
5010 vectorized. Create exit phis for the outer loop. */
5011 if (double_reduc)
5013 loop = outer_loop;
5014 exit_bb = single_exit (loop)->dest;
5015 inner_phis.create (vect_defs.length ());
5016 FOR_EACH_VEC_ELT (new_phis, i, phi)
5018 tree new_result = copy_ssa_name (PHI_RESULT (phi));
5019 gphi *outer_phi = create_phi_node (new_result, exit_bb);
5020 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5021 PHI_RESULT (phi));
5022 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5023 loop_vinfo));
5024 inner_phis.quick_push (phi);
5025 new_phis[i] = outer_phi;
5026 prev_phi_info = vinfo_for_stmt (outer_phi);
5027 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5029 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5030 new_result = copy_ssa_name (PHI_RESULT (phi));
5031 outer_phi = create_phi_node (new_result, exit_bb);
5032 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5033 PHI_RESULT (phi));
5034 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5035 loop_vinfo));
5036 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5037 prev_phi_info = vinfo_for_stmt (outer_phi);
5042 exit_gsi = gsi_after_labels (exit_bb);
5044 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5045 (i.e. when reduc_fn is not available) and in the final adjustment
5046 code (if needed). Also get the original scalar reduction variable as
5047 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5048 represents a reduction pattern), the tree-code and scalar-def are
5049 taken from the original stmt that the pattern-stmt (STMT) replaces.
5050 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5051 are taken from STMT. */
5053 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5054 if (!orig_stmt)
5056 /* Regular reduction */
5057 orig_stmt = stmt;
5059 else
5061 /* Reduction pattern */
5062 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5063 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5064 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5067 code = gimple_assign_rhs_code (orig_stmt);
5068 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5069 partial results are added and not subtracted. */
5070 if (code == MINUS_EXPR)
5071 code = PLUS_EXPR;
5073 scalar_dest = gimple_assign_lhs (orig_stmt);
5074 scalar_type = TREE_TYPE (scalar_dest);
5075 scalar_results.create (group_size);
5076 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5077 bitsize = TYPE_SIZE (scalar_type);
5079 /* In case this is a reduction in an inner-loop while vectorizing an outer
5080 loop - we don't need to extract a single scalar result at the end of the
5081 inner-loop (unless it is double reduction, i.e., the use of reduction is
5082 outside the outer-loop). The final vector of partial results will be used
5083 in the vectorized outer-loop, or reduced to a scalar result at the end of
5084 the outer-loop. */
5085 if (nested_in_vect_loop && !double_reduc)
5086 goto vect_finalize_reduction;
5088 /* SLP reduction without reduction chain, e.g.,
5089 # a1 = phi <a2, a0>
5090 # b1 = phi <b2, b0>
5091 a2 = operation (a1)
5092 b2 = operation (b1) */
5093 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5095 /* True if we should implement SLP_REDUC using native reduction operations
5096 instead of scalar operations. */
5097 direct_slp_reduc = (reduc_fn != IFN_LAST
5098 && slp_reduc
5099 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5101 /* In case of reduction chain, e.g.,
5102 # a1 = phi <a3, a0>
5103 a2 = operation (a1)
5104 a3 = operation (a2),
5106 we may end up with more than one vector result. Here we reduce them to
5107 one vector. */
5108 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5110 tree first_vect = PHI_RESULT (new_phis[0]);
5111 gassign *new_vec_stmt = NULL;
5112 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5113 for (k = 1; k < new_phis.length (); k++)
5115 gimple *next_phi = new_phis[k];
5116 tree second_vect = PHI_RESULT (next_phi);
5117 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5118 new_vec_stmt = gimple_build_assign (tem, code,
5119 first_vect, second_vect);
5120 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5121 first_vect = tem;
5124 new_phi_result = first_vect;
5125 if (new_vec_stmt)
5127 new_phis.truncate (0);
5128 new_phis.safe_push (new_vec_stmt);
5131 /* Likewise if we couldn't use a single defuse cycle. */
5132 else if (ncopies > 1)
5134 gcc_assert (new_phis.length () == 1);
5135 tree first_vect = PHI_RESULT (new_phis[0]);
5136 gassign *new_vec_stmt = NULL;
5137 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5138 gimple *next_phi = new_phis[0];
5139 for (int k = 1; k < ncopies; ++k)
5141 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5142 tree second_vect = PHI_RESULT (next_phi);
5143 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5144 new_vec_stmt = gimple_build_assign (tem, code,
5145 first_vect, second_vect);
5146 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5147 first_vect = tem;
5149 new_phi_result = first_vect;
5150 new_phis.truncate (0);
5151 new_phis.safe_push (new_vec_stmt);
5153 else
5154 new_phi_result = PHI_RESULT (new_phis[0]);
5156 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5157 && reduc_fn != IFN_LAST)
5159 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5160 various data values where the condition matched and another vector
5161 (INDUCTION_INDEX) containing all the indexes of those matches. We
5162 need to extract the last matching index (which will be the index with
5163 highest value) and use this to index into the data vector.
5164 For the case where there were no matches, the data vector will contain
5165 all default values and the index vector will be all zeros. */
5167 /* Get various versions of the type of the vector of indexes. */
5168 tree index_vec_type = TREE_TYPE (induction_index);
5169 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5170 tree index_scalar_type = TREE_TYPE (index_vec_type);
5171 tree index_vec_cmp_type = build_same_sized_truth_vector_type
5172 (index_vec_type);
5174 /* Get an unsigned integer version of the type of the data vector. */
5175 int scalar_precision
5176 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5177 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5178 tree vectype_unsigned = build_vector_type
5179 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5181 /* First we need to create a vector (ZERO_VEC) of zeros and another
5182 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5183 can create using a MAX reduction and then expanding.
5184 In the case where the loop never made any matches, the max index will
5185 be zero. */
5187 /* Vector of {0, 0, 0,...}. */
5188 tree zero_vec = make_ssa_name (vectype);
5189 tree zero_vec_rhs = build_zero_cst (vectype);
5190 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5191 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5193 /* Find maximum value from the vector of found indexes. */
5194 tree max_index = make_ssa_name (index_scalar_type);
5195 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5196 1, induction_index);
5197 gimple_call_set_lhs (max_index_stmt, max_index);
5198 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5200 /* Vector of {max_index, max_index, max_index,...}. */
5201 tree max_index_vec = make_ssa_name (index_vec_type);
5202 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5203 max_index);
5204 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5205 max_index_vec_rhs);
5206 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5208 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5209 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5210 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5211 otherwise. Only one value should match, resulting in a vector
5212 (VEC_COND) with one data value and the rest zeros.
5213 In the case where the loop never made any matches, every index will
5214 match, resulting in a vector with all data values (which will all be
5215 the default value). */
5217 /* Compare the max index vector to the vector of found indexes to find
5218 the position of the max value. */
5219 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5220 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5221 induction_index,
5222 max_index_vec);
5223 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5225 /* Use the compare to choose either values from the data vector or
5226 zero. */
5227 tree vec_cond = make_ssa_name (vectype);
5228 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5229 vec_compare, new_phi_result,
5230 zero_vec);
5231 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5233 /* Finally we need to extract the data value from the vector (VEC_COND)
5234 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5235 reduction, but because this doesn't exist, we can use a MAX reduction
5236 instead. The data value might be signed or a float so we need to cast
5237 it first.
5238 In the case where the loop never made any matches, the data values are
5239 all identical, and so will reduce down correctly. */
5241 /* Make the matched data values unsigned. */
5242 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5243 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5244 vec_cond);
5245 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5246 VIEW_CONVERT_EXPR,
5247 vec_cond_cast_rhs);
5248 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5250 /* Reduce down to a scalar value. */
5251 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5252 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5253 1, vec_cond_cast);
5254 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5255 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5257 /* Convert the reduced value back to the result type and set as the
5258 result. */
5259 gimple_seq stmts = NULL;
5260 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5261 data_reduc);
5262 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5263 scalar_results.safe_push (new_temp);
5265 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5266 && reduc_fn == IFN_LAST)
5268 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5269 idx = 0;
5270 idx_val = induction_index[0];
5271 val = data_reduc[0];
5272 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5273 if (induction_index[i] > idx_val)
5274 val = data_reduc[i], idx_val = induction_index[i];
5275 return val; */
5277 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5278 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5279 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5280 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5281 /* Enforced by vectorizable_reduction, which ensures we have target
5282 support before allowing a conditional reduction on variable-length
5283 vectors. */
5284 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5285 tree idx_val = NULL_TREE, val = NULL_TREE;
5286 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5288 tree old_idx_val = idx_val;
5289 tree old_val = val;
5290 idx_val = make_ssa_name (idx_eltype);
5291 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5292 build3 (BIT_FIELD_REF, idx_eltype,
5293 induction_index,
5294 bitsize_int (el_size),
5295 bitsize_int (off)));
5296 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5297 val = make_ssa_name (data_eltype);
5298 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5299 build3 (BIT_FIELD_REF,
5300 data_eltype,
5301 new_phi_result,
5302 bitsize_int (el_size),
5303 bitsize_int (off)));
5304 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5305 if (off != 0)
5307 tree new_idx_val = idx_val;
5308 tree new_val = val;
5309 if (off != v_size - el_size)
5311 new_idx_val = make_ssa_name (idx_eltype);
5312 epilog_stmt = gimple_build_assign (new_idx_val,
5313 MAX_EXPR, idx_val,
5314 old_idx_val);
5315 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5317 new_val = make_ssa_name (data_eltype);
5318 epilog_stmt = gimple_build_assign (new_val,
5319 COND_EXPR,
5320 build2 (GT_EXPR,
5321 boolean_type_node,
5322 idx_val,
5323 old_idx_val),
5324 val, old_val);
5325 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5326 idx_val = new_idx_val;
5327 val = new_val;
5330 /* Convert the reduced value back to the result type and set as the
5331 result. */
5332 gimple_seq stmts = NULL;
5333 val = gimple_convert (&stmts, scalar_type, val);
5334 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5335 scalar_results.safe_push (val);
5338 /* 2.3 Create the reduction code, using one of the three schemes described
5339 above. In SLP we simply need to extract all the elements from the
5340 vector (without reducing them), so we use scalar shifts. */
5341 else if (reduc_fn != IFN_LAST && !slp_reduc)
5343 tree tmp;
5344 tree vec_elem_type;
5346 /* Case 1: Create:
5347 v_out2 = reduc_expr <v_out1> */
5349 if (dump_enabled_p ())
5350 dump_printf_loc (MSG_NOTE, vect_location,
5351 "Reduce using direct vector reduction.\n");
5353 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5354 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5356 tree tmp_dest
5357 = vect_create_destination_var (scalar_dest, vec_elem_type);
5358 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5359 new_phi_result);
5360 gimple_set_lhs (epilog_stmt, tmp_dest);
5361 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5362 gimple_set_lhs (epilog_stmt, new_temp);
5363 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5365 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5366 new_temp);
5368 else
5370 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5371 new_phi_result);
5372 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5375 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5376 gimple_set_lhs (epilog_stmt, new_temp);
5377 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5379 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5380 == INTEGER_INDUC_COND_REDUCTION)
5381 && !operand_equal_p (initial_def, induc_val, 0))
5383 /* Earlier we set the initial value to be a vector of induc_val
5384 values. Check the result and if it is induc_val then replace
5385 it with the original initial value, unless induc_val is
5386 the same as initial_def already. */
5387 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5388 induc_val);
5390 tmp = make_ssa_name (new_scalar_dest);
5391 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5392 initial_def, new_temp);
5393 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5394 new_temp = tmp;
5397 scalar_results.safe_push (new_temp);
5399 else if (direct_slp_reduc)
5401 /* Here we create one vector for each of the GROUP_SIZE results,
5402 with the elements for other SLP statements replaced with the
5403 neutral value. We can then do a normal reduction on each vector. */
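 /* Illustrative example (not generated code): with GROUP_SIZE == 2 and an
    accumulator vector { a0, b0, a1, b1 }, the masked index vector below is
    { 0, 1, 0, 1 }, so the first iteration reduces { a0, n, a1, n } and the
    second reduces { n, b0, n, b1 }, where n is the neutral value
    (e.g. 0 for PLUS).  */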
5405 /* Enforced by vectorizable_reduction. */
5406 gcc_assert (new_phis.length () == 1);
5407 gcc_assert (pow2p_hwi (group_size));
5409 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5410 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5411 gimple_seq seq = NULL;
5413 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5414 and the same element size as VECTYPE. */
5415 tree index = build_index_vector (vectype, 0, 1);
5416 tree index_type = TREE_TYPE (index);
5417 tree index_elt_type = TREE_TYPE (index_type);
5418 tree mask_type = build_same_sized_truth_vector_type (index_type);
5420 /* Create a vector that, for each element, identifies which of
5421 the GROUP_SIZE results should use it. */
5422 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5423 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5424 build_vector_from_val (index_type, index_mask));
5426 /* Get a neutral vector value. This is simply a splat of the neutral
5427 scalar value if we have one, otherwise the initial scalar value
5428 is itself a neutral value. */
5429 tree vector_identity = NULL_TREE;
5430 if (neutral_op)
5431 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5432 neutral_op);
5433 for (unsigned int i = 0; i < group_size; ++i)
5435 /* If there's no universal neutral value, we can use the
5436 initial scalar value from the original PHI. This is used
5437 for MIN and MAX reduction, for example. */
5438 if (!neutral_op)
5440 tree scalar_value
5441 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5442 loop_preheader_edge (loop));
5443 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5444 scalar_value);
5447 /* Calculate the equivalent of:
5449 sel[j] = (index[j] == i);
5451 which selects the elements of NEW_PHI_RESULT that should
5452 be included in the result. */
5453 tree compare_val = build_int_cst (index_elt_type, i);
5454 compare_val = build_vector_from_val (index_type, compare_val);
5455 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5456 index, compare_val);
5458 /* Calculate the equivalent of:
5460 vec = sel ? new_phi_result : vector_identity;
5462 VEC is now suitable for a full vector reduction. */
5463 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5464 sel, new_phi_result, vector_identity);
5466 /* Do the reduction and convert it to the appropriate type. */
5467 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5468 tree scalar = make_ssa_name (TREE_TYPE (vectype));
5469 gimple_call_set_lhs (call, scalar);
5470 gimple_seq_add_stmt (&seq, call);
5471 scalar = gimple_convert (&seq, scalar_type, scalar);
5472 scalar_results.safe_push (scalar);
5474 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5476 else
5478 bool reduce_with_shift;
5479 tree vec_temp;
5481 /* COND reductions all do the final reduction with MAX_EXPR
5482 or MIN_EXPR. */
5483 if (code == COND_EXPR)
5485 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5486 == INTEGER_INDUC_COND_REDUCTION)
5487 code = induc_code;
5488 else
5489 code = MAX_EXPR;
5492 /* See if the target wants to do the final (shift) reduction
5493 in a vector mode of smaller size and first reduce upper/lower
5494 halves against each other. */
5495 enum machine_mode mode1 = mode;
5496 tree vectype1 = vectype;
5497 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5498 unsigned sz1 = sz;
5499 if (!slp_reduc
5500 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5501 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5503 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5504 reduce_with_shift = have_whole_vector_shift (mode1);
5505 if (!VECTOR_MODE_P (mode1))
5506 reduce_with_shift = false;
5507 else
5509 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5510 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5511 reduce_with_shift = false;
5514 /* First reduce the vector to the desired vector size on which we
5515 should do the shift reduction, by combining upper and lower halves. */
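 /* Illustrative example: if the accumulator is V4DI but the target's
    preferred mode for the shift reduction is V2DI, the loop below extracts
    the low and high V2DI halves of NEW_TEMP, combines them with CODE and
    continues the reduction on the resulting V2DI value.  */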
5516 new_temp = new_phi_result;
5517 while (sz > sz1)
5519 gcc_assert (!slp_reduc);
5520 sz /= 2;
5521 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5523 /* The target has to make sure we support lowpart/highpart
5524 extraction, either via direct vector extract or through
5525 integer mode punning. */
5526 tree dst1, dst2;
5527 if (convert_optab_handler (vec_extract_optab,
5528 TYPE_MODE (TREE_TYPE (new_temp)),
5529 TYPE_MODE (vectype1))
5530 != CODE_FOR_nothing)
5532 /* Extract sub-vectors directly once vec_extract becomes
5533 a conversion optab. */
5534 dst1 = make_ssa_name (vectype1);
5535 epilog_stmt
5536 = gimple_build_assign (dst1, BIT_FIELD_REF,
5537 build3 (BIT_FIELD_REF, vectype1,
5538 new_temp, TYPE_SIZE (vectype1),
5539 bitsize_int (0)));
5540 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5541 dst2 = make_ssa_name (vectype1);
5542 epilog_stmt
5543 = gimple_build_assign (dst2, BIT_FIELD_REF,
5544 build3 (BIT_FIELD_REF, vectype1,
5545 new_temp, TYPE_SIZE (vectype1),
5546 bitsize_int (sz * BITS_PER_UNIT)));
5547 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5549 else
5551 /* Extract via punning to appropriately sized integer mode
5552 vector. */
5553 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5554 1);
5555 tree etype = build_vector_type (eltype, 2);
5556 gcc_assert (convert_optab_handler (vec_extract_optab,
5557 TYPE_MODE (etype),
5558 TYPE_MODE (eltype))
5559 != CODE_FOR_nothing);
5560 tree tem = make_ssa_name (etype);
5561 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5562 build1 (VIEW_CONVERT_EXPR,
5563 etype, new_temp));
5564 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5565 new_temp = tem;
5566 tem = make_ssa_name (eltype);
5567 epilog_stmt
5568 = gimple_build_assign (tem, BIT_FIELD_REF,
5569 build3 (BIT_FIELD_REF, eltype,
5570 new_temp, TYPE_SIZE (eltype),
5571 bitsize_int (0)));
5572 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5573 dst1 = make_ssa_name (vectype1);
5574 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5575 build1 (VIEW_CONVERT_EXPR,
5576 vectype1, tem));
5577 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5578 tem = make_ssa_name (eltype);
5579 epilog_stmt
5580 = gimple_build_assign (tem, BIT_FIELD_REF,
5581 build3 (BIT_FIELD_REF, eltype,
5582 new_temp, TYPE_SIZE (eltype),
5583 bitsize_int (sz * BITS_PER_UNIT)));
5584 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5585 dst2 = make_ssa_name (vectype1);
5586 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5587 build1 (VIEW_CONVERT_EXPR,
5588 vectype1, tem));
5589 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5592 new_temp = make_ssa_name (vectype1);
5593 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5594 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5597 if (reduce_with_shift && !slp_reduc)
5599 int element_bitsize = tree_to_uhwi (bitsize);
5600 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5601 for variable-length vectors and also requires direct target support
5602 for loop reductions. */
5603 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5604 int nelements = vec_size_in_bits / element_bitsize;
5605 vec_perm_builder sel;
5606 vec_perm_indices indices;
5608 int elt_offset;
5610 tree zero_vec = build_zero_cst (vectype1);
5611 /* Case 2: Create:
5612 for (offset = nelements/2; offset >= 1; offset/=2)
5614 Create: va' = vec_shift <va, offset>
5615 Create: va = vop <va, va'>
5616 } */
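 /* Illustrative example, with nelements == 4 and CODE == PLUS_EXPR:
      va' = vec_shift <va, 2>   (i.e. { va[2], va[3], 0, 0 });  va = va + va'
      va' = vec_shift <va, 1>;                                  va = va + va'
    after which element 0 of va holds the sum of all four original elements
    and is extracted below with a BIT_FIELD_REF at offset 0.  */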
5618 tree rhs;
5620 if (dump_enabled_p ())
5621 dump_printf_loc (MSG_NOTE, vect_location,
5622 "Reduce using vector shifts\n");
5624 mode1 = TYPE_MODE (vectype1);
5625 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5626 for (elt_offset = nelements / 2;
5627 elt_offset >= 1;
5628 elt_offset /= 2)
5630 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5631 indices.new_vector (sel, 2, nelements);
5632 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5633 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5634 new_temp, zero_vec, mask);
5635 new_name = make_ssa_name (vec_dest, epilog_stmt);
5636 gimple_assign_set_lhs (epilog_stmt, new_name);
5637 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5639 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5640 new_temp);
5641 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5642 gimple_assign_set_lhs (epilog_stmt, new_temp);
5643 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5646 /* 2.4 Extract the final scalar result. Create:
5647 s_out3 = extract_field <v_out2, bitpos> */
5649 if (dump_enabled_p ())
5650 dump_printf_loc (MSG_NOTE, vect_location,
5651 "extract scalar result\n");
5653 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5654 bitsize, bitsize_zero_node);
5655 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5656 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5657 gimple_assign_set_lhs (epilog_stmt, new_temp);
5658 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5659 scalar_results.safe_push (new_temp);
5661 else
5663 /* Case 3: Create:
5664 s = extract_field <v_out2, 0>
5665 for (offset = element_size;
5666 offset < vector_size;
5667 offset += element_size)
5669 Create: s' = extract_field <v_out2, offset>
5670 Create: s = op <s, s'> // For non SLP cases
5671 } */
5673 if (dump_enabled_p ())
5674 dump_printf_loc (MSG_NOTE, vect_location,
5675 "Reduce using scalar code.\n");
5677 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5678 int element_bitsize = tree_to_uhwi (bitsize);
5679 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5681 int bit_offset;
5682 if (gimple_code (new_phi) == GIMPLE_PHI)
5683 vec_temp = PHI_RESULT (new_phi);
5684 else
5685 vec_temp = gimple_assign_lhs (new_phi);
5686 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5687 bitsize_zero_node);
5688 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5689 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5690 gimple_assign_set_lhs (epilog_stmt, new_temp);
5691 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5693 /* In SLP we don't need to apply the reduction operation, so we just
5694 collect s' values in SCALAR_RESULTS. */
5695 if (slp_reduc)
5696 scalar_results.safe_push (new_temp);
5698 for (bit_offset = element_bitsize;
5699 bit_offset < vec_size_in_bits;
5700 bit_offset += element_bitsize)
5702 tree bitpos = bitsize_int (bit_offset);
5703 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5704 bitsize, bitpos);
5706 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5707 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5708 gimple_assign_set_lhs (epilog_stmt, new_name);
5709 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5711 if (slp_reduc)
5713 /* In SLP we don't need to apply the reduction operation, so
5714 we just collect s' values in SCALAR_RESULTS. */
5715 new_temp = new_name;
5716 scalar_results.safe_push (new_name);
5718 else
5720 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5721 new_name, new_temp);
5722 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5723 gimple_assign_set_lhs (epilog_stmt, new_temp);
5724 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5729 /* The only case in which we need to reduce scalar results in SLP is
5730 unrolling. If the size of SCALAR_RESULTS is greater than
5731 GROUP_SIZE, we reduce them by combining elements modulo
5732 GROUP_SIZE. */
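 /* Illustrative example: with GROUP_SIZE == 2 and four scalar results
    { s0, s1, s2, s3 } from an unrolled SLP instance, the loop below combines
    s0 with s2 and s1 with s3, leaving the two final results in
    SCALAR_RESULTS[0] and SCALAR_RESULTS[1].  */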
5733 if (slp_reduc)
5735 tree res, first_res, new_res;
5736 gimple *new_stmt;
5738 /* Reduce multiple scalar results in case of SLP unrolling. */
5739 for (j = group_size; scalar_results.iterate (j, &res);
5740 j++)
5742 first_res = scalar_results[j % group_size];
5743 new_stmt = gimple_build_assign (new_scalar_dest, code,
5744 first_res, res);
5745 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5746 gimple_assign_set_lhs (new_stmt, new_res);
5747 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5748 scalar_results[j % group_size] = new_res;
5751 else
5752 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5753 scalar_results.safe_push (new_temp);
5756 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5757 == INTEGER_INDUC_COND_REDUCTION)
5758 && !operand_equal_p (initial_def, induc_val, 0))
5760 /* Earlier we set the initial value to be a vector of induc_val
5761 values. Check the result and if it is induc_val then replace
5762 it with the original initial value, unless induc_val is
5763 the same as initial_def already. */
5764 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5765 induc_val);
5767 tree tmp = make_ssa_name (new_scalar_dest);
5768 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5769 initial_def, new_temp);
5770 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5771 scalar_results[0] = tmp;
5775 vect_finalize_reduction:
5777 if (double_reduc)
5778 loop = loop->inner;
5780 /* 2.5 Adjust the final result by the initial value of the reduction
5781 variable. (When such adjustment is not needed, then
5782 'adjustment_def' is zero). For example, if code is PLUS we create:
5783 new_temp = loop_exit_def + adjustment_def */
5785 if (adjustment_def)
5787 gcc_assert (!slp_reduc);
5788 if (nested_in_vect_loop)
5790 new_phi = new_phis[0];
5791 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5792 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5793 new_dest = vect_create_destination_var (scalar_dest, vectype);
5795 else
5797 new_temp = scalar_results[0];
5798 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5799 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5800 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5803 epilog_stmt = gimple_build_assign (new_dest, expr);
5804 new_temp = make_ssa_name (new_dest, epilog_stmt);
5805 gimple_assign_set_lhs (epilog_stmt, new_temp);
5806 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5807 if (nested_in_vect_loop)
5809 set_vinfo_for_stmt (epilog_stmt,
5810 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5811 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5812 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5814 if (!double_reduc)
5815 scalar_results.quick_push (new_temp);
5816 else
5817 scalar_results[0] = new_temp;
5819 else
5820 scalar_results[0] = new_temp;
5822 new_phis[0] = epilog_stmt;
5825 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5826 phis with new adjusted scalar results, i.e., replace use <s_out0>
5827 with use <s_out4>.
5829 Transform:
5830 loop_exit:
5831 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5832 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5833 v_out2 = reduce <v_out1>
5834 s_out3 = extract_field <v_out2, 0>
5835 s_out4 = adjust_result <s_out3>
5836 use <s_out0>
5837 use <s_out0>
5839 into:
5841 loop_exit:
5842 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5843 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5844 v_out2 = reduce <v_out1>
5845 s_out3 = extract_field <v_out2, 0>
5846 s_out4 = adjust_result <s_out3>
5847 use <s_out4>
5848 use <s_out4> */
5851 /* For an SLP reduction chain we reduce the vector results into one vector
5852 if necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS
5853 of the last stmt in the reduction chain, since we are looking for the
5854 loop exit phi node. */
5855 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5857 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5858 /* Handle reduction patterns. */
5859 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5860 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5862 scalar_dest = gimple_assign_lhs (dest_stmt);
5863 group_size = 1;
5866 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5867 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5868 need to match SCALAR_RESULTS with corresponding statements. The first
5869 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5870 the first vector stmt, etc.
5871 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5872 if (group_size > new_phis.length ())
5874 ratio = group_size / new_phis.length ();
5875 gcc_assert (!(group_size % new_phis.length ()));
5877 else
5878 ratio = 1;
5880 for (k = 0; k < group_size; k++)
5882 if (k % ratio == 0)
5884 epilog_stmt = new_phis[k / ratio];
5885 reduction_phi = reduction_phis[k / ratio];
5886 if (double_reduc)
5887 inner_phi = inner_phis[k / ratio];
5890 if (slp_reduc)
5892 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5894 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5895 /* SLP statements can't participate in patterns. */
5896 gcc_assert (!orig_stmt);
5897 scalar_dest = gimple_assign_lhs (current_stmt);
5900 phis.create (3);
5901 /* Find the loop-closed-use at the loop exit of the original scalar
5902 result. (The reduction result is expected to have two immediate uses -
5903 one at the latch block, and one at the loop exit). */
5904 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5905 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5906 && !is_gimple_debug (USE_STMT (use_p)))
5907 phis.safe_push (USE_STMT (use_p));
5909 /* While we expect to have found an exit_phi because of loop-closed-ssa
5910 form, we can end up without one if the scalar cycle is dead. */
5912 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5914 if (outer_loop)
5916 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5917 gphi *vect_phi;
5919 /* FORNOW. We do not currently support the case in which an inner-loop
5920 reduction is used only outside the outer-loop (rather than in the
5921 outer-loop), unless it is a double reduction. */
5922 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5923 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5924 || double_reduc);
5926 if (double_reduc)
5927 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5928 else
5929 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5930 if (!double_reduc
5931 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5932 != vect_double_reduction_def)
5933 continue;
5935 /* Handle double reduction:
5937 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5938 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5939 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5940 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5942 At that point the regular reduction (stmt2 and stmt3) is
5943 already vectorized, as well as the exit phi node, stmt4.
5944 Here we vectorize the phi node of double reduction, stmt1, and
5945 update all relevant statements. */
5947 /* Go through all the uses of s2 to find double reduction phi
5948 node, i.e., stmt1 above. */
5949 orig_name = PHI_RESULT (exit_phi);
5950 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5952 stmt_vec_info use_stmt_vinfo;
5953 stmt_vec_info new_phi_vinfo;
5954 tree vect_phi_init, preheader_arg, vect_phi_res;
5955 basic_block bb = gimple_bb (use_stmt);
5956 gimple *use;
5958 /* Check that USE_STMT is really a double reduction phi
5959 node. */
5960 if (gimple_code (use_stmt) != GIMPLE_PHI
5961 || gimple_phi_num_args (use_stmt) != 2
5962 || bb->loop_father != outer_loop)
5963 continue;
5964 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5965 if (!use_stmt_vinfo
5966 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5967 != vect_double_reduction_def)
5968 continue;
5970 /* Create vector phi node for double reduction:
5971 vs1 = phi <vs0, vs2>
5972 vs1 was created previously in this function by a call to
5973 vect_get_vec_def_for_operand and is stored in
5974 vec_initial_def;
5975 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5976 vs0 is created here. */
5978 /* Create vector phi node. */
5979 vect_phi = create_phi_node (vec_initial_def, bb);
5980 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5981 loop_vec_info_for_loop (outer_loop));
5982 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5984 /* Create vs0 - initial def of the double reduction phi. */
5985 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5986 loop_preheader_edge (outer_loop));
5987 vect_phi_init = get_initial_def_for_reduction
5988 (stmt, preheader_arg, NULL);
5990 /* Update phi node arguments with vs0 and vs2. */
5991 add_phi_arg (vect_phi, vect_phi_init,
5992 loop_preheader_edge (outer_loop),
5993 UNKNOWN_LOCATION);
5994 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5995 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5996 if (dump_enabled_p ())
5998 dump_printf_loc (MSG_NOTE, vect_location,
5999 "created double reduction phi node: ");
6000 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6003 vect_phi_res = PHI_RESULT (vect_phi);
6005 /* Replace the use, i.e., set the correct vs1 in the regular
6006 reduction phi node. FORNOW, NCOPIES is always 1, so the
6007 loop is redundant. */
6008 use = reduction_phi;
6009 for (j = 0; j < ncopies; j++)
6011 edge pr_edge = loop_preheader_edge (loop);
6012 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6013 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6019 phis.release ();
6020 if (nested_in_vect_loop)
6022 if (double_reduc)
6023 loop = outer_loop;
6024 else
6025 continue;
6028 phis.create (3);
6029 /* Find the loop-closed-use at the loop exit of the original scalar
6030 result. (The reduction result is expected to have two immediate uses,
6031 one at the latch block, and one at the loop exit). For double
6032 reductions we are looking for exit phis of the outer loop. */
6033 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6035 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6037 if (!is_gimple_debug (USE_STMT (use_p)))
6038 phis.safe_push (USE_STMT (use_p));
6040 else
6042 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6044 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6046 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6048 if (!flow_bb_inside_loop_p (loop,
6049 gimple_bb (USE_STMT (phi_use_p)))
6050 && !is_gimple_debug (USE_STMT (phi_use_p)))
6051 phis.safe_push (USE_STMT (phi_use_p));
6057 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6059 /* Replace the uses: */
6060 orig_name = PHI_RESULT (exit_phi);
6061 scalar_result = scalar_results[k];
6062 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6063 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6064 SET_USE (use_p, scalar_result);
6067 phis.release ();
6071 /* Return a vector of type VECTYPE that is equal to the vector select
6072 operation "MASK ? VEC : IDENTITY". Insert the select statements
6073 before GSI. */
6075 static tree
6076 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6077 tree vec, tree identity)
6079 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6080 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6081 mask, vec, identity);
6082 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6083 return cond;
6086 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6087 order, starting with LHS. Insert the extraction statements before GSI and
6088 associate the new scalar SSA names with variable SCALAR_DEST.
6089 Return the SSA name for the result. */
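 /* Illustrative example: expanding a four-element VECTOR_RHS with
    CODE == PLUS_EXPR emits roughly
      s0 = BIT_FIELD_REF <v, bitsize, 0>;        lhs1 = lhs + s0;
      s1 = BIT_FIELD_REF <v, bitsize, bitsize>;  lhs2 = lhs1 + s1;
      ...
    and returns the last lhs, preserving the left-to-right association
    required for in-order reductions.  */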
6091 static tree
6092 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6093 tree_code code, tree lhs, tree vector_rhs)
6095 tree vectype = TREE_TYPE (vector_rhs);
6096 tree scalar_type = TREE_TYPE (vectype);
6097 tree bitsize = TYPE_SIZE (scalar_type);
6098 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6099 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6101 for (unsigned HOST_WIDE_INT bit_offset = 0;
6102 bit_offset < vec_size_in_bits;
6103 bit_offset += element_bitsize)
6105 tree bitpos = bitsize_int (bit_offset);
6106 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6107 bitsize, bitpos);
6109 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6110 rhs = make_ssa_name (scalar_dest, stmt);
6111 gimple_assign_set_lhs (stmt, rhs);
6112 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6114 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6115 tree new_name = make_ssa_name (scalar_dest, stmt);
6116 gimple_assign_set_lhs (stmt, new_name);
6117 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6118 lhs = new_name;
6120 return lhs;
6123 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
6124 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6125 statement. CODE is the operation performed by STMT and OPS are
6126 its scalar operands. REDUC_INDEX is the index of the operand in
6127 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6128 implements in-order reduction, or IFN_LAST if we should open-code it.
6129 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6130 that should be used to control the operation in a fully-masked loop. */
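 /* Illustrative example: an in-order float addition of a vector
    { a0, a1, a2, a3 } into the running value RES computes
    (((RES + a0) + a1) + a2) + a3, either through a single call to REDUC_FN
    when the target provides one or by expanding element by element with
    vect_expand_fold_left, so the result matches the scalar evaluation
    order exactly.  */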
6132 static bool
6133 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6134 gimple **vec_stmt, slp_tree slp_node,
6135 gimple *reduc_def_stmt,
6136 tree_code code, internal_fn reduc_fn,
6137 tree ops[3], tree vectype_in,
6138 int reduc_index, vec_loop_masks *masks)
6140 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6141 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6142 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6143 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6144 gimple *new_stmt = NULL;
6146 int ncopies;
6147 if (slp_node)
6148 ncopies = 1;
6149 else
6150 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6152 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6153 gcc_assert (ncopies == 1);
6154 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6155 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6156 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6157 == FOLD_LEFT_REDUCTION);
6159 if (slp_node)
6160 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6161 TYPE_VECTOR_SUBPARTS (vectype_in)));
6163 tree op0 = ops[1 - reduc_index];
6165 int group_size = 1;
6166 gimple *scalar_dest_def;
6167 auto_vec<tree> vec_oprnds0;
6168 if (slp_node)
6170 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6171 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6172 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6174 else
6176 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6177 vec_oprnds0.create (1);
6178 vec_oprnds0.quick_push (loop_vec_def0);
6179 scalar_dest_def = stmt;
6182 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6183 tree scalar_type = TREE_TYPE (scalar_dest);
6184 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6186 int vec_num = vec_oprnds0.length ();
6187 gcc_assert (vec_num == 1 || slp_node);
6188 tree vec_elem_type = TREE_TYPE (vectype_out);
6189 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6191 tree vector_identity = NULL_TREE;
6192 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6193 vector_identity = build_zero_cst (vectype_out);
6195 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6196 int i;
6197 tree def0;
6198 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6200 tree mask = NULL_TREE;
6201 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6202 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6204 /* Handle MINUS by adding the negative. */
6205 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6207 tree negated = make_ssa_name (vectype_out);
6208 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6209 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6210 def0 = negated;
6213 if (mask)
6214 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6215 vector_identity);
6217 /* On the first iteration the input is simply the scalar phi
6218 result, and for subsequent iterations it is the output of
6219 the preceding operation. */
6220 if (reduc_fn != IFN_LAST)
6222 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6223 /* For chained SLP reductions the output of the previous reduction
6224 operation serves as the input of the next. For the final statement
6225 the output cannot be a temporary - we reuse the original
6226 scalar destination of the last statement. */
6227 if (i != vec_num - 1)
6229 gimple_set_lhs (new_stmt, scalar_dest_var);
6230 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6231 gimple_set_lhs (new_stmt, reduc_var);
6234 else
6236 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6237 reduc_var, def0);
6238 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6239 /* Remove the statement, so that we can use the same code paths
6240 as for statements that we've just created. */
6241 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6242 gsi_remove (&tmp_gsi, false);
6245 if (i == vec_num - 1)
6247 gimple_set_lhs (new_stmt, scalar_dest);
6248 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6250 else
6251 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6253 if (slp_node)
6254 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6257 if (!slp_node)
6258 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6260 return true;
6263 /* Function is_nonwrapping_integer_induction.
6265 Check whether STMT (which is part of loop LOOP) is an integer
6266 induction that is guaranteed not to overflow. */
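 /* Illustrative example: a 16-bit unsigned induction with base 0 and step 4
    in a loop that executes at most 1000 iterations reaches at most 4000,
    which needs only 12 bits, so it fits in the 16-bit type and cannot
    wrap.  */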
6268 static bool
6269 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6271 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6272 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6273 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6274 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6275 widest_int ni, max_loop_value, lhs_max;
6276 bool overflow = false;
6278 /* Make sure the loop is integer based. */
6279 if (TREE_CODE (base) != INTEGER_CST
6280 || TREE_CODE (step) != INTEGER_CST)
6281 return false;
6283 /* Check that the maximum value reached by the induction will not wrap. */
6285 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6286 return true;
6288 if (! max_stmt_executions (loop, &ni))
6289 return false;
6291 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6292 &overflow);
6293 if (overflow)
6294 return false;
6296 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6297 TYPE_SIGN (lhs_type), &overflow);
6298 if (overflow)
6299 return false;
6301 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6302 <= TYPE_PRECISION (lhs_type));
6305 /* Function vectorizable_reduction.
6307 Check if STMT performs a reduction operation that can be vectorized.
6308 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6309 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6310 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6312 This function also handles reduction idioms (patterns) that have been
6313 recognized in advance during vect_pattern_recog. In this case, STMT may be
6314 of this form:
6315 X = pattern_expr (arg0, arg1, ..., X)
6316 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6317 sequence that had been detected and replaced by the pattern-stmt (STMT).
6319 This function also handles reduction of condition expressions, for example:
6320 for (int i = 0; i < N; i++)
6321 if (a[i] < value)
6322 last = a[i];
6323 This is handled by vectorizing the loop and creating an additional vector
6324 containing the loop indexes for which "a[i] < value" was true. In the
6325 function epilogue this is reduced to a single max value and then used to
6326 index into the vector of results.
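 As an illustrative sketch, the vectorized loop conceptually maintains a
 second vector that records, per lane, the last loop index at which
 "a[i] < value" held; the epilogue reduces that index vector with a MAX
 and uses the position of the maximum to select the corresponding element
 of the data vector (see the induction-index epilogue code earlier in
 this file).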
6328 In some cases of reduction patterns, the type of the reduction variable X is
6329 different than the type of the other arguments of STMT.
6330 In such cases, the vectype that is used when transforming STMT into a vector
6331 stmt is different than the vectype that is used to determine the
6332 vectorization factor, because it consists of a different number of elements
6333 than the actual number of elements that are being operated upon in parallel.
6335 For example, consider an accumulation of shorts into an int accumulator.
6336 On some targets it's possible to vectorize this pattern operating on 8
6337 shorts at a time (hence, the vectype for purposes of determining the
6338 vectorization factor should be V8HI); on the other hand, the vectype that
6339 is used to create the vector form is actually V4SI (the type of the result).
6341 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6342 indicates what is the actual level of parallelism (V8HI in the example), so
6343 that the right vectorization factor would be derived. This vectype
6344 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6345 be used to create the vectorized stmt. The right vectype for the vectorized
6346 stmt is obtained from the type of the result X:
6347 get_vectype_for_scalar_type (TREE_TYPE (X))
6349 This means that, contrary to "regular" reductions (or "regular" stmts in
6350 general), the following equation:
6351 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6352 does *NOT* necessarily hold for reduction patterns. */
6354 bool
6355 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6356 gimple **vec_stmt, slp_tree slp_node,
6357 slp_instance slp_node_instance)
6359 tree vec_dest;
6360 tree scalar_dest;
6361 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6362 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6363 tree vectype_in = NULL_TREE;
6364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6365 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6366 enum tree_code code, orig_code;
6367 internal_fn reduc_fn;
6368 machine_mode vec_mode;
6369 int op_type;
6370 optab optab;
6371 tree new_temp = NULL_TREE;
6372 gimple *def_stmt;
6373 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6374 gimple *cond_reduc_def_stmt = NULL;
6375 enum tree_code cond_reduc_op_code = ERROR_MARK;
6376 tree scalar_type;
6377 bool is_simple_use;
6378 gimple *orig_stmt;
6379 stmt_vec_info orig_stmt_info = NULL;
6380 int i;
6381 int ncopies;
6382 int epilog_copies;
6383 stmt_vec_info prev_stmt_info, prev_phi_info;
6384 bool single_defuse_cycle = false;
6385 gimple *new_stmt = NULL;
6386 int j;
6387 tree ops[3];
6388 enum vect_def_type dts[3];
6389 bool nested_cycle = false, found_nested_cycle_def = false;
6390 bool double_reduc = false;
6391 basic_block def_bb;
6392 struct loop * def_stmt_loop, *outer_loop = NULL;
6393 tree def_arg;
6394 gimple *def_arg_stmt;
6395 auto_vec<tree> vec_oprnds0;
6396 auto_vec<tree> vec_oprnds1;
6397 auto_vec<tree> vec_oprnds2;
6398 auto_vec<tree> vect_defs;
6399 auto_vec<gimple *> phis;
6400 int vec_num;
6401 tree def0, tem;
6402 bool first_p = true;
6403 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6404 tree cond_reduc_val = NULL_TREE;
6406 /* Make sure it was already recognized as a reduction computation. */
6407 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6408 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6409 return false;
6411 if (nested_in_vect_loop_p (loop, stmt))
6413 outer_loop = loop;
6414 loop = loop->inner;
6415 nested_cycle = true;
6418 /* In the case of a reduction chain we switch to the first stmt in the
6419 chain, but we don't update STMT_INFO, since only the last stmt is marked
6420 as a reduction and has reduction properties. */
6421 if (GROUP_FIRST_ELEMENT (stmt_info)
6422 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6424 stmt = GROUP_FIRST_ELEMENT (stmt_info);
6425 first_p = false;
6428 if (gimple_code (stmt) == GIMPLE_PHI)
6430 /* Analysis is fully done on the reduction stmt invocation. */
6431 if (! vec_stmt)
6433 if (slp_node)
6434 slp_node_instance->reduc_phis = slp_node;
6436 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6437 return true;
6440 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6441 /* Leave the scalar phi in place. Note that checking
6442 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6443 for reductions involving a single statement. */
6444 return true;
6446 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6447 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6448 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6450 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6451 == EXTRACT_LAST_REDUCTION)
6452 /* Leave the scalar phi in place. */
6453 return true;
6455 gcc_assert (is_gimple_assign (reduc_stmt));
6456 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6458 tree op = gimple_op (reduc_stmt, k);
6459 if (op == gimple_phi_result (stmt))
6460 continue;
6461 if (k == 1
6462 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6463 continue;
6464 if (!vectype_in
6465 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6466 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6467 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6468 break;
6470 gcc_assert (vectype_in);
6472 if (slp_node)
6473 ncopies = 1;
6474 else
6475 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6477 use_operand_p use_p;
6478 gimple *use_stmt;
6479 if (ncopies > 1
6480 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6481 <= vect_used_only_live)
6482 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6483 && (use_stmt == reduc_stmt
6484 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6485 == reduc_stmt)))
6486 single_defuse_cycle = true;
6488 /* Create the destination vector */
6489 scalar_dest = gimple_assign_lhs (reduc_stmt);
6490 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6492 if (slp_node)
6493 /* The size vect_schedule_slp_instance computes is off for us. */
6494 vec_num = vect_get_num_vectors
6495 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6496 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6497 vectype_in);
6498 else
6499 vec_num = 1;
6501 /* Generate the reduction PHIs upfront. */
6502 prev_phi_info = NULL;
6503 for (j = 0; j < ncopies; j++)
6505 if (j == 0 || !single_defuse_cycle)
6507 for (i = 0; i < vec_num; i++)
6509 /* Create the reduction-phi that defines the reduction
6510 operand. */
6511 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6512 set_vinfo_for_stmt (new_phi,
6513 new_stmt_vec_info (new_phi, loop_vinfo));
6515 if (slp_node)
6516 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6517 else
6519 if (j == 0)
6520 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6521 else
6522 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6523 prev_phi_info = vinfo_for_stmt (new_phi);
6529 return true;
6532 /* 1. Is vectorizable reduction? */
6533 /* Not supportable if the reduction variable is used in the loop, unless
6534 it's a reduction chain. */
6535 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6536 && !GROUP_FIRST_ELEMENT (stmt_info))
6537 return false;
6539 /* Reductions that are not used even in an enclosing outer-loop
6540 are expected to be "live" (used out of the loop). */
6541 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6542 && !STMT_VINFO_LIVE_P (stmt_info))
6543 return false;
6545 /* 2. Has this been recognized as a reduction pattern?
6547 Check if STMT represents a pattern that has been recognized
6548 in earlier analysis stages. For stmts that represent a pattern,
6549 the STMT_VINFO_RELATED_STMT field records the last stmt in
6550 the original sequence that constitutes the pattern. */
6552 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6553 if (orig_stmt)
6555 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6556 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6557 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6560 /* 3. Check the operands of the operation. The first operands are defined
6561 inside the loop body. The last operand is the reduction variable,
6562 which is defined by the loop-header-phi. */
6564 gcc_assert (is_gimple_assign (stmt));
6566 /* Flatten RHS. */
6567 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6569 case GIMPLE_BINARY_RHS:
6570 code = gimple_assign_rhs_code (stmt);
6571 op_type = TREE_CODE_LENGTH (code);
6572 gcc_assert (op_type == binary_op);
6573 ops[0] = gimple_assign_rhs1 (stmt);
6574 ops[1] = gimple_assign_rhs2 (stmt);
6575 break;
6577 case GIMPLE_TERNARY_RHS:
6578 code = gimple_assign_rhs_code (stmt);
6579 op_type = TREE_CODE_LENGTH (code);
6580 gcc_assert (op_type == ternary_op);
6581 ops[0] = gimple_assign_rhs1 (stmt);
6582 ops[1] = gimple_assign_rhs2 (stmt);
6583 ops[2] = gimple_assign_rhs3 (stmt);
6584 break;
6586 case GIMPLE_UNARY_RHS:
6587 return false;
6589 default:
6590 gcc_unreachable ();
6593 if (code == COND_EXPR && slp_node)
6594 return false;
6596 scalar_dest = gimple_assign_lhs (stmt);
6597 scalar_type = TREE_TYPE (scalar_dest);
6598 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6599 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6600 return false;
6602 /* Do not try to vectorize bit-precision reductions. */
6603 if (!type_has_mode_precision_p (scalar_type))
6604 return false;
6606 /* All uses but the last are expected to be defined in the loop.
6607 The last use is the reduction variable. In case of nested cycle this
6608 assumption is not true: we use reduc_index to record the index of the
6609 reduction variable. */
6610 gimple *reduc_def_stmt = NULL;
6611 int reduc_index = -1;
6612 for (i = 0; i < op_type; i++)
6614 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6615 if (i == 0 && code == COND_EXPR)
6616 continue;
6618 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6619 &def_stmt, &dts[i], &tem);
6620 dt = dts[i];
6621 gcc_assert (is_simple_use);
6622 if (dt == vect_reduction_def)
6624 reduc_def_stmt = def_stmt;
6625 reduc_index = i;
6626 continue;
6628 else if (tem)
6630 /* To properly compute ncopies we are interested in the widest
6631 input type in case we're looking at a widening accumulation. */
6632 if (!vectype_in
6633 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6634 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6635 vectype_in = tem;
6638 if (dt != vect_internal_def
6639 && dt != vect_external_def
6640 && dt != vect_constant_def
6641 && dt != vect_induction_def
6642 && !(dt == vect_nested_cycle && nested_cycle))
6643 return false;
6645 if (dt == vect_nested_cycle)
6647 found_nested_cycle_def = true;
6648 reduc_def_stmt = def_stmt;
6649 reduc_index = i;
6652 if (i == 1 && code == COND_EXPR)
6654 /* Record how value of COND_EXPR is defined. */
6655 if (dt == vect_constant_def)
6657 cond_reduc_dt = dt;
6658 cond_reduc_val = ops[i];
6660 if (dt == vect_induction_def
6661 && def_stmt != NULL
6662 && is_nonwrapping_integer_induction (def_stmt, loop))
6664 cond_reduc_dt = dt;
6665 cond_reduc_def_stmt = def_stmt;
6670 if (!vectype_in)
6671 vectype_in = vectype_out;
6673 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6674 directly used in stmt. */
6675 if (reduc_index == -1)
6677 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6679 if (dump_enabled_p ())
6680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6681 "in-order reduction chain without SLP.\n");
6682 return false;
6685 if (orig_stmt)
6686 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6687 else
6688 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6691 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6692 return false;
6694 if (!(reduc_index == -1
6695 || dts[reduc_index] == vect_reduction_def
6696 || dts[reduc_index] == vect_nested_cycle
6697 || ((dts[reduc_index] == vect_internal_def
6698 || dts[reduc_index] == vect_external_def
6699 || dts[reduc_index] == vect_constant_def
6700 || dts[reduc_index] == vect_induction_def)
6701 && nested_cycle && found_nested_cycle_def)))
6703 /* For pattern recognized stmts, orig_stmt might be a reduction,
6704 but some helper statements for the pattern might not, or
6705 might be COND_EXPRs with reduction uses in the condition. */
6706 gcc_assert (orig_stmt);
6707 return false;
6710 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6711 enum vect_reduction_type v_reduc_type
6712 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6713 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6715 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6716 /* If we have a condition reduction, see if we can simplify it further. */
6717 if (v_reduc_type == COND_REDUCTION)
6719 /* Loop peeling modifies the initial value of the reduction PHI, which
6720 makes the reduction stmt to be transformed differ from the
6721 original stmt that was analyzed. We need to record the reduction code
6722 for a CONST_COND_REDUCTION type reduction at the analysis stage so that
6723 it can be used directly at the transform stage. */
6724 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6725 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6727 /* Also set the reduction type to CONST_COND_REDUCTION. */
6728 gcc_assert (cond_reduc_dt == vect_constant_def);
6729 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6731 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6732 vectype_in, OPTIMIZE_FOR_SPEED))
6734 if (dump_enabled_p ())
6735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6736 "optimizing condition reduction with"
6737 " FOLD_EXTRACT_LAST.\n");
6738 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6740 else if (cond_reduc_dt == vect_induction_def)
6742 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6743 tree base
6744 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6745 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6747 gcc_assert (TREE_CODE (base) == INTEGER_CST
6748 && TREE_CODE (step) == INTEGER_CST);
6749 cond_reduc_val = NULL_TREE;
6750 /* Find a suitable value: below base for MAX_EXPR, above base for
6751 MIN_EXPR; punt for now if base is the minimum value of the type for
6752 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6753 if (tree_int_cst_sgn (step) == -1)
6755 cond_reduc_op_code = MIN_EXPR;
6756 if (tree_int_cst_sgn (base) == -1)
6757 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6758 else if (tree_int_cst_lt (base,
6759 TYPE_MAX_VALUE (TREE_TYPE (base))))
6760 cond_reduc_val
6761 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6763 else
6765 cond_reduc_op_code = MAX_EXPR;
6766 if (tree_int_cst_sgn (base) == 1)
6767 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6768 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6769 base))
6770 cond_reduc_val
6771 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6773 if (cond_reduc_val)
6775 if (dump_enabled_p ())
6776 dump_printf_loc (MSG_NOTE, vect_location,
6777 "condition expression based on "
6778 "integer induction.\n");
6779 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6780 = INTEGER_INDUC_COND_REDUCTION;
6783 else if (cond_reduc_dt == vect_constant_def)
6785 enum vect_def_type cond_initial_dt;
6786 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6787 tree cond_initial_val
6788 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6790 gcc_assert (cond_reduc_val != NULL_TREE);
6791 vect_is_simple_use (cond_initial_val, loop_vinfo,
6792 &def_stmt, &cond_initial_dt);
6793 if (cond_initial_dt == vect_constant_def
6794 && types_compatible_p (TREE_TYPE (cond_initial_val),
6795 TREE_TYPE (cond_reduc_val)))
6797 tree e = fold_binary (LE_EXPR, boolean_type_node,
6798 cond_initial_val, cond_reduc_val);
6799 if (e && (integer_onep (e) || integer_zerop (e)))
6801 if (dump_enabled_p ())
6802 dump_printf_loc (MSG_NOTE, vect_location,
6803 "condition expression based on "
6804 "compile time constant.\n");
6805 /* Record reduction code at analysis stage. */
6806 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6807 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6808 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6809 = CONST_COND_REDUCTION;
6815 if (orig_stmt)
6816 gcc_assert (tmp == orig_stmt
6817 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6818 else
6819 /* We changed STMT to be the first stmt in reduction chain, hence we
6820 check that in this case the first element in the chain is STMT. */
6821 gcc_assert (stmt == tmp
6822 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6824 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6825 return false;
6827 if (slp_node)
6828 ncopies = 1;
6829 else
6830 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6832 gcc_assert (ncopies >= 1);
6834 vec_mode = TYPE_MODE (vectype_in);
6835 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6837 if (code == COND_EXPR)
6839 /* Only call during the analysis stage, otherwise we'll lose
6840 STMT_VINFO_TYPE. */
6841 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6842 ops[reduc_index], 0, NULL))
6844 if (dump_enabled_p ())
6845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6846 "unsupported condition in reduction\n");
6847 return false;
6850 else
6852 /* 4. Supportable by target? */
6854 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6855 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6857 /* Shifts and rotates are only supported by vectorizable_shifts,
6858 not vectorizable_reduction. */
6859 if (dump_enabled_p ())
6860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6861 "unsupported shift or rotation.\n");
6862 return false;
6865 /* 4.1. check support for the operation in the loop */
6866 optab = optab_for_tree_code (code, vectype_in, optab_default);
6867 if (!optab)
6869 if (dump_enabled_p ())
6870 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6871 "no optab.\n");
6873 return false;
6876 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6878 if (dump_enabled_p ())
6879 dump_printf (MSG_NOTE, "op not supported by target.\n");
6881 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6882 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6883 return false;
6885 if (dump_enabled_p ())
6886 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6889 /* Worthwhile without SIMD support? */
6890 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6891 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6893 if (dump_enabled_p ())
6894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6895 "not worthwhile without SIMD support.\n");
6897 return false;
6901 /* 4.2. Check support for the epilog operation.
6903 If STMT represents a reduction pattern, then the type of the
6904 reduction variable may be different than the type of the rest
6905 of the arguments. For example, consider the case of accumulation
6906 of shorts into an int accumulator; The original code:
6907 S1: int_a = (int) short_a;
6908 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6910 was replaced with:
6911 STMT: int_acc = widen_sum <short_a, int_acc>
6913 This means that:
6914 1. The tree-code that is used to create the vector operation in the
6915 epilog code (that reduces the partial results) is not the
6916 tree-code of STMT, but is rather the tree-code of the original
6917 stmt from the pattern that STMT is replacing. I.e, in the example
6918 above we want to use 'widen_sum' in the loop, but 'plus' in the
6919 epilog.
6920 2. The type (mode) we use to check available target support
6921 for the vector operation to be created in the *epilog*, is
6922 determined by the type of the reduction variable (in the example
6923 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6924 However the type (mode) we use to check available target support
6925 for the vector operation to be created *inside the loop*, is
6926 determined by the type of the other arguments to STMT (in the
6927 example we'd check this: optab_handler (widen_sum_optab,
6928 vect_short_mode)).
6930 This is contrary to "regular" reductions, in which the types of all
6931 the arguments are the same as the type of the reduction variable.
6932 For "regular" reductions we can therefore use the same vector type
6933 (and also the same tree-code) when generating the epilog code and
6934 when generating the code inside the loop. */
6936 vect_reduction_type reduction_type
6937 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6938 if (orig_stmt
6939 && (reduction_type == TREE_CODE_REDUCTION
6940 || reduction_type == FOLD_LEFT_REDUCTION))
6942 /* This is a reduction pattern: get the vectype from the type of the
6943 reduction variable, and get the tree-code from orig_stmt. */
6944 orig_code = gimple_assign_rhs_code (orig_stmt);
6945 gcc_assert (vectype_out);
6946 vec_mode = TYPE_MODE (vectype_out);
6948 else
6950 /* Regular reduction: the same vectype and tree-code that are used for
6951 the vector code inside the loop can also be used for the epilog code. */
6952 orig_code = code;
6954 if (code == MINUS_EXPR)
6955 orig_code = PLUS_EXPR;
6957 /* For simple condition reductions, replace with the actual expression
6958 we want to base our reduction around. */
6959 if (reduction_type == CONST_COND_REDUCTION)
6961 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6962 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6964 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6965 orig_code = cond_reduc_op_code;
6968 if (nested_cycle)
6970 def_bb = gimple_bb (reduc_def_stmt);
6971 def_stmt_loop = def_bb->loop_father;
6972 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6973 loop_preheader_edge (def_stmt_loop));
6974 if (TREE_CODE (def_arg) == SSA_NAME
6975 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6976 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6977 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6978 && vinfo_for_stmt (def_arg_stmt)
6979 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6980 == vect_double_reduction_def)
6981 double_reduc = true;
6984 reduc_fn = IFN_LAST;
6986 if (reduction_type == TREE_CODE_REDUCTION
6987 || reduction_type == FOLD_LEFT_REDUCTION
6988 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6989 || reduction_type == CONST_COND_REDUCTION)
6991 if (reduction_type == FOLD_LEFT_REDUCTION
6992 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6993 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6995 if (reduc_fn != IFN_LAST
6996 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6997 OPTIMIZE_FOR_SPEED))
6999 if (dump_enabled_p ())
7000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7001 "reduc op not supported by target.\n");
7003 reduc_fn = IFN_LAST;
7006 else
7008 if (!nested_cycle || double_reduc)
7010 if (dump_enabled_p ())
7011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7012 "no reduc code for scalar code.\n");
7014 return false;
7018 else if (reduction_type == COND_REDUCTION)
7020 int scalar_precision
7021 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7022 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7023 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7024 nunits_out);
7026 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7027 OPTIMIZE_FOR_SPEED))
7028 reduc_fn = IFN_REDUC_MAX;
7031 if (reduction_type != EXTRACT_LAST_REDUCTION
7032 && reduc_fn == IFN_LAST
7033 && !nunits_out.is_constant ())
7035 if (dump_enabled_p ())
7036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7037 "missing target support for reduction on"
7038 " variable-length vectors.\n");
7039 return false;
7042 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7043 && ncopies > 1)
7045 if (dump_enabled_p ())
7046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7047 "multiple types in double reduction or condition "
7048 "reduction.\n");
7049 return false;
7052 /* For SLP reductions, see if there is a neutral value we can use. */
7053 tree neutral_op = NULL_TREE;
7054 if (slp_node)
7055 neutral_op
7056 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7057 GROUP_FIRST_ELEMENT (stmt_info) != NULL);
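/* For illustration: the neutral value is one that leaves the reduction
   unchanged, e.g. 0 for PLUS_EXPR, MINUS_EXPR, BIT_IOR_EXPR and
   BIT_XOR_EXPR, 1 for MULT_EXPR and all-ones for BIT_AND_EXPR; for
   MIN_EXPR/MAX_EXPR only a chained reduction has a usable neutral value
   (its single initial value).  */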
7059 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7061 /* We can't support in-order reductions of code such as this:
7063 for (int i = 0; i < n1; ++i)
7064 for (int j = 0; j < n2; ++j)
7065 l += a[j];
7067 since GCC effectively transforms the loop when vectorizing:
7069 for (int i = 0; i < n1 / VF; ++i)
7070 for (int j = 0; j < n2; ++j)
7071 for (int k = 0; k < VF; ++k)
7072 l += a[j];
7074 which is a reassociation of the original operation. */
7075 if (dump_enabled_p ())
7076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7077 "in-order double reduction not supported.\n");
7079 return false;
7082 if (reduction_type == FOLD_LEFT_REDUCTION
7083 && slp_node
7084 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7086 /* We cannot use in-order reductions in this case because there is
7087 an implicit reassociation of the operations involved. */
7088 if (dump_enabled_p ())
7089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7090 "in-order unchained SLP reductions not supported.\n");
7091 return false;
7094 /* For double reductions, and for SLP reductions with a neutral value,
7095 we construct a variable-length initial vector by loading a vector
7096 full of the neutral value and then shift-and-inserting the start
7097 values into the low-numbered elements. */
7098 if ((double_reduc || neutral_op)
7099 && !nunits_out.is_constant ()
7100 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7101 vectype_out, OPTIMIZE_FOR_SPEED))
7103 if (dump_enabled_p ())
7104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7105 "reduction on variable-length vectors requires"
7106 " target support for a vector-shift-and-insert"
7107 " operation.\n");
7108 return false;
7111 /* Check extra constraints for variable-length unchained SLP reductions. */
7112 if (STMT_SLP_TYPE (stmt_info)
7113 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7114 && !nunits_out.is_constant ())
7116 /* We checked above that we could build the initial vector when
7117 there's a neutral element value. Check here for the case in
7118 which each SLP statement has its own initial value and in which
7119 that value needs to be repeated for every instance of the
7120 statement within the initial vector. */
7121 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7122 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7123 if (!neutral_op
7124 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7126 if (dump_enabled_p ())
7127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7128 "unsupported form of SLP reduction for"
7129 " variable-length vectors: cannot build"
7130 " initial vector.\n");
7131 return false;
7133 /* The epilogue code relies on the number of elements being a multiple
7134 of the group size. The duplicate-and-interleave approach to setting
7135 up the initial vector does too. */
7136 if (!multiple_p (nunits_out, group_size))
7138 if (dump_enabled_p ())
7139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7140 "unsupported form of SLP reduction for"
7141 " variable-length vectors: the vector size"
7142 " is not a multiple of the number of results.\n");
7143 return false;
7147 /* In case of widening multiplication by a constant, we update the type
7148 of the constant to be the type of the other operand. We check that the
7149 constant fits the type in the pattern recognition pass. */
7150 if (code == DOT_PROD_EXPR
7151 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7153 if (TREE_CODE (ops[0]) == INTEGER_CST)
7154 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7155 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7156 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7157 else
7159 if (dump_enabled_p ())
7160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7161 "invalid types in dot-prod\n");
7163 return false;
7167 if (reduction_type == COND_REDUCTION)
7169 widest_int ni;
7171 if (! max_loop_iterations (loop, &ni))
7173 if (dump_enabled_p ())
7174 dump_printf_loc (MSG_NOTE, vect_location,
7175 "loop count not known, cannot create cond "
7176 "reduction.\n");
7177 return false;
7179 /* Convert backedges to iterations. */
7180 ni += 1;
7182 /* The additional index will be the same type as the condition. Check
7183 that the loop iteration count fits into this type less one (the zero
7184 slot is reserved for the case in which there are no matches). */
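/* Worked example (illustrative): if the scalar type is a 16-bit integer,
   cr_index_scalar_type is a 16-bit unsigned type and max_index is 65535.
   Index 0 is reserved for "no match", so loops that may run 65535 or more
   iterations are rejected below.  */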
7185 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7186 if (wi::geu_p (ni, wi::to_widest (max_index)))
7188 if (dump_enabled_p ())
7189 dump_printf_loc (MSG_NOTE, vect_location,
7190 "loop size is greater than data size.\n");
7191 return false;
7195 /* In case the vectorization factor (VF) is bigger than the number
7196 of elements that we can fit in a vectype (nunits), we have to generate
7197 more than one vector stmt, i.e., we need to "unroll" the
7198 vector stmt by a factor VF/nunits. For more details see documentation
7199 in vectorizable_operation. */
7201 /* If the reduction is used in an outer loop we need to generate
7202 VF intermediate results, like so (e.g. for ncopies=2):
7203 r0 = phi (init, r0)
7204 r1 = phi (init, r1)
7205 r0 = x0 + r0;
7206 r1 = x1 + r1;
7207 (i.e. we generate VF results in 2 registers).
7208 In this case we have a separate def-use cycle for each copy, and therefore
7209 for each copy we get the vector def for the reduction variable from the
7210 respective phi node created for this copy.
7212 Otherwise (the reduction is unused in the loop nest), we can combine
7213 together intermediate results, like so (e.g. for ncopies=2):
7214 r = phi (init, r)
7215 r = x0 + r;
7216 r = x1 + r;
7217 (i.e. we generate VF/2 results in a single register).
7218 In this case for each copy we get the vector def for the reduction variable
7219 from the vectorized reduction operation generated in the previous iteration.
7221 This only works when we see both the reduction PHI and its only consumer
7222 in vectorizable_reduction and there are no intermediate stmts
7223 participating. */
7224 use_operand_p use_p;
7225 gimple *use_stmt;
7226 if (ncopies > 1
7227 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7228 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7229 && (use_stmt == stmt
7230 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7232 single_defuse_cycle = true;
7233 epilog_copies = 1;
7235 else
7236 epilog_copies = ncopies;
7238 /* If the reduction stmt is one of the patterns that have lane
7239 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7240 if ((ncopies > 1
7241 && ! single_defuse_cycle)
7242 && (code == DOT_PROD_EXPR
7243 || code == WIDEN_SUM_EXPR
7244 || code == SAD_EXPR))
7246 if (dump_enabled_p ())
7247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7248 "multi def-use cycle not possible for lane-reducing "
7249 "reduction operation\n");
7250 return false;
7253 if (slp_node)
7254 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7255 else
7256 vec_num = 1;
7258 internal_fn cond_fn = get_conditional_internal_fn (code);
7259 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7261 if (!vec_stmt) /* transformation not required. */
7263 if (first_p)
7264 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7265 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7267 if (reduction_type != FOLD_LEFT_REDUCTION
7268 && (cond_fn == IFN_LAST
7269 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7270 OPTIMIZE_FOR_SPEED)))
7272 if (dump_enabled_p ())
7273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7274 "can't use a fully-masked loop because no"
7275 " conditional operation is available.\n");
7276 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7278 else if (reduc_index == -1)
7280 if (dump_enabled_p ())
7281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7282 "can't use a fully-masked loop for chained"
7283 " reductions.\n");
7284 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7286 else
7287 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7288 vectype_in);
7290 if (dump_enabled_p ()
7291 && reduction_type == FOLD_LEFT_REDUCTION)
7292 dump_printf_loc (MSG_NOTE, vect_location,
7293 "using an in-order (fold-left) reduction.\n");
7294 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7295 return true;
7298 /* Transform. */
7300 if (dump_enabled_p ())
7301 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7303 /* FORNOW: Multiple types are not supported for condition. */
7304 if (code == COND_EXPR)
7305 gcc_assert (ncopies == 1);
7307 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7309 if (reduction_type == FOLD_LEFT_REDUCTION)
7310 return vectorize_fold_left_reduction
7311 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7312 reduc_fn, ops, vectype_in, reduc_index, masks);
7314 if (reduction_type == EXTRACT_LAST_REDUCTION)
7316 gcc_assert (!slp_node);
7317 return vectorizable_condition (stmt, gsi, vec_stmt,
7318 NULL, reduc_index, NULL);
7321 /* Create the destination vector */
7322 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7324 prev_stmt_info = NULL;
7325 prev_phi_info = NULL;
7326 if (!slp_node)
7328 vec_oprnds0.create (1);
7329 vec_oprnds1.create (1);
7330 if (op_type == ternary_op)
7331 vec_oprnds2.create (1);
7334 phis.create (vec_num);
7335 vect_defs.create (vec_num);
7336 if (!slp_node)
7337 vect_defs.quick_push (NULL_TREE);
7339 if (slp_node)
7340 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7341 else
7342 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7344 for (j = 0; j < ncopies; j++)
7346 if (code == COND_EXPR)
7348 gcc_assert (!slp_node);
7349 vectorizable_condition (stmt, gsi, vec_stmt,
7350 PHI_RESULT (phis[0]),
7351 reduc_index, NULL);
7352 /* Multiple types are not supported for condition. */
7353 break;
7356 /* Handle uses. */
7357 if (j == 0)
7359 if (slp_node)
7361 /* Get vec defs for all the operands except the reduction index,
7362 ensuring the ordering of the ops in the vector is kept. */
7363 auto_vec<tree, 3> slp_ops;
7364 auto_vec<vec<tree>, 3> vec_defs;
7366 slp_ops.quick_push (ops[0]);
7367 slp_ops.quick_push (ops[1]);
7368 if (op_type == ternary_op)
7369 slp_ops.quick_push (ops[2]);
7371 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7373 vec_oprnds0.safe_splice (vec_defs[0]);
7374 vec_defs[0].release ();
7375 vec_oprnds1.safe_splice (vec_defs[1]);
7376 vec_defs[1].release ();
7377 if (op_type == ternary_op)
7379 vec_oprnds2.safe_splice (vec_defs[2]);
7380 vec_defs[2].release ();
7383 else
7385 vec_oprnds0.quick_push
7386 (vect_get_vec_def_for_operand (ops[0], stmt));
7387 vec_oprnds1.quick_push
7388 (vect_get_vec_def_for_operand (ops[1], stmt));
7389 if (op_type == ternary_op)
7390 vec_oprnds2.quick_push
7391 (vect_get_vec_def_for_operand (ops[2], stmt));
7394 else
7396 if (!slp_node)
7398 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7400 if (single_defuse_cycle && reduc_index == 0)
7401 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7402 else
7403 vec_oprnds0[0]
7404 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7405 if (single_defuse_cycle && reduc_index == 1)
7406 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7407 else
7408 vec_oprnds1[0]
7409 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7410 if (op_type == ternary_op)
7412 if (single_defuse_cycle && reduc_index == 2)
7413 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7414 else
7415 vec_oprnds2[0]
7416 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7421 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7423 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7424 if (masked_loop_p)
7426 /* Make sure that the reduction accumulator is vop[0]. */
7427 if (reduc_index == 1)
7429 gcc_assert (commutative_tree_code (code));
7430 std::swap (vop[0], vop[1]);
7432 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7433 vectype_in, i * ncopies + j);
7434 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7435 vop[0], vop[1]);
7436 new_temp = make_ssa_name (vec_dest, call);
7437 gimple_call_set_lhs (call, new_temp);
7438 gimple_call_set_nothrow (call, true);
7439 new_stmt = call;
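/* For example (assuming code == PLUS_EXPR), cond_fn is IFN_COND_ADD and the
   call above computes, per lane, MASK ? ACC + X : ACC, so lanes that are
   inactive in this iteration leave the accumulator unchanged.  */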
7441 else
7443 if (op_type == ternary_op)
7444 vop[2] = vec_oprnds2[i];
7446 new_temp = make_ssa_name (vec_dest, new_stmt);
7447 new_stmt = gimple_build_assign (new_temp, code,
7448 vop[0], vop[1], vop[2]);
7450 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7452 if (slp_node)
7454 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7455 vect_defs.quick_push (new_temp);
7457 else
7458 vect_defs[0] = new_temp;
7461 if (slp_node)
7462 continue;
7464 if (j == 0)
7465 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7466 else
7467 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7469 prev_stmt_info = vinfo_for_stmt (new_stmt);
7472 /* Finalize the reduction-phi (set its arguments) and create the
7473 epilog reduction code. */
7474 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7475 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7477 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7478 epilog_copies, reduc_fn, phis,
7479 double_reduc, slp_node, slp_node_instance,
7480 cond_reduc_val, cond_reduc_op_code,
7481 neutral_op);
7483 return true;
7486 /* Function vect_min_worthwhile_factor.
7488 For a loop where we could vectorize the operation indicated by CODE,
7489 return the minimum vectorization factor that makes it worthwhile
7490 to use generic vectors. */
7491 static unsigned int
7492 vect_min_worthwhile_factor (enum tree_code code)
7494 switch (code)
7496 case PLUS_EXPR:
7497 case MINUS_EXPR:
7498 case NEGATE_EXPR:
7499 return 4;
7501 case BIT_AND_EXPR:
7502 case BIT_IOR_EXPR:
7503 case BIT_XOR_EXPR:
7504 case BIT_NOT_EXPR:
7505 return 2;
7507 default:
7508 return INT_MAX;
7512 /* Return true if VINFO indicates we are doing loop vectorization and if
7513 it is worth decomposing CODE operations into scalar operations for
7514 that loop's vectorization factor. */
7516 bool
7517 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7519 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7520 unsigned HOST_WIDE_INT value;
7521 return (loop_vinfo
7522 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7523 && value >= vect_min_worthwhile_factor (code));
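/* Worked example (illustrative): with a constant vectorization factor of 2,
   the function returns true for BIT_AND_EXPR (minimum worthwhile factor 2)
   but false for PLUS_EXPR (minimum worthwhile factor 4); codes not listed
   in vect_min_worthwhile_factor fall back to INT_MAX and are never
   considered worthwhile.  */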
7526 /* Function vectorizable_induction
7528 Check if PHI performs an induction computation that can be vectorized.
7529 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7530 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7531 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7533 bool
7534 vectorizable_induction (gimple *phi,
7535 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7536 gimple **vec_stmt, slp_tree slp_node)
7538 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7539 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7540 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7541 unsigned ncopies;
7542 bool nested_in_vect_loop = false;
7543 struct loop *iv_loop;
7544 tree vec_def;
7545 edge pe = loop_preheader_edge (loop);
7546 basic_block new_bb;
7547 tree new_vec, vec_init, vec_step, t;
7548 tree new_name;
7549 gimple *new_stmt;
7550 gphi *induction_phi;
7551 tree induc_def, vec_dest;
7552 tree init_expr, step_expr;
7553 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7554 unsigned i;
7555 tree expr;
7556 gimple_seq stmts;
7557 imm_use_iterator imm_iter;
7558 use_operand_p use_p;
7559 gimple *exit_phi;
7560 edge latch_e;
7561 tree loop_arg;
7562 gimple_stmt_iterator si;
7563 basic_block bb = gimple_bb (phi);
7565 if (gimple_code (phi) != GIMPLE_PHI)
7566 return false;
7568 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7569 return false;
7571 /* Make sure it was recognized as induction computation. */
7572 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7573 return false;
7575 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7576 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7578 if (slp_node)
7579 ncopies = 1;
7580 else
7581 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7582 gcc_assert (ncopies >= 1);
7584 /* FORNOW. These restrictions should be relaxed. */
7585 if (nested_in_vect_loop_p (loop, phi))
7587 imm_use_iterator imm_iter;
7588 use_operand_p use_p;
7589 gimple *exit_phi;
7590 edge latch_e;
7591 tree loop_arg;
7593 if (ncopies > 1)
7595 if (dump_enabled_p ())
7596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7597 "multiple types in nested loop.\n");
7598 return false;
7601 /* FORNOW: outer loop induction with SLP not supported. */
7602 if (STMT_SLP_TYPE (stmt_info))
7603 return false;
7605 exit_phi = NULL;
7606 latch_e = loop_latch_edge (loop->inner);
7607 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7608 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7610 gimple *use_stmt = USE_STMT (use_p);
7611 if (is_gimple_debug (use_stmt))
7612 continue;
7614 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7616 exit_phi = use_stmt;
7617 break;
7620 if (exit_phi)
7622 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7623 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7624 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7626 if (dump_enabled_p ())
7627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7628 "inner-loop induction only used outside "
7629 "of the outer vectorized loop.\n");
7630 return false;
7634 nested_in_vect_loop = true;
7635 iv_loop = loop->inner;
7637 else
7638 iv_loop = loop;
7639 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7641 if (slp_node && !nunits.is_constant ())
7643 /* The current SLP code creates the initial value element-by-element. */
7644 if (dump_enabled_p ())
7645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7646 "SLP induction not supported for variable-length"
7647 " vectors.\n");
7648 return false;
7651 if (!vec_stmt) /* transformation not required. */
7653 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7654 if (dump_enabled_p ())
7655 dump_printf_loc (MSG_NOTE, vect_location,
7656 "=== vectorizable_induction ===\n");
7657 vect_model_induction_cost (stmt_info, ncopies);
7658 return true;
7661 /* Transform. */
7663 /* Compute a vector variable, initialized with the first VF values of
7664 the induction variable. E.g., for an iv with IV_PHI='X' and
7665 evolution S, for a vector of 4 units, we want to compute:
7666 [X, X + S, X + 2*S, X + 3*S]. */
7668 if (dump_enabled_p ())
7669 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7671 latch_e = loop_latch_edge (iv_loop);
7672 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7674 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7675 gcc_assert (step_expr != NULL_TREE);
7677 pe = loop_preheader_edge (iv_loop);
7678 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7679 loop_preheader_edge (iv_loop));
7681 stmts = NULL;
7682 if (!nested_in_vect_loop)
7684 /* Convert the initial value to the desired type. */
7685 tree new_type = TREE_TYPE (vectype);
7686 init_expr = gimple_convert (&stmts, new_type, init_expr);
7688 /* If we are using the loop mask to "peel" for alignment then we need
7689 to adjust the start value here. */
7690 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7691 if (skip_niters != NULL_TREE)
7693 if (FLOAT_TYPE_P (vectype))
7694 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7695 skip_niters);
7696 else
7697 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7698 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7699 skip_niters, step_expr);
7700 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7701 init_expr, skip_step);
7705 /* Convert the step to the desired type. */
7706 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7708 if (stmts)
7710 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7711 gcc_assert (!new_bb);
7714 /* Find the first insertion point in the BB. */
7715 si = gsi_after_labels (bb);
7717 /* For SLP induction we have to generate several IVs as for example
7718 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7719 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7720 [VF*S, VF*S, VF*S, VF*S] for all. */
7721 if (slp_node)
7723 /* Enforced above. */
7724 unsigned int const_nunits = nunits.to_constant ();
7726 /* Generate [VF*S, VF*S, ... ]. */
7727 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7729 expr = build_int_cst (integer_type_node, vf);
7730 expr = fold_convert (TREE_TYPE (step_expr), expr);
7732 else
7733 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7734 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7735 expr, step_expr);
7736 if (! CONSTANT_CLASS_P (new_name))
7737 new_name = vect_init_vector (phi, new_name,
7738 TREE_TYPE (step_expr), NULL);
7739 new_vec = build_vector_from_val (vectype, new_name);
7740 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7742 /* Now generate the IVs. */
7743 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7744 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7745 unsigned elts = const_nunits * nvects;
7746 unsigned nivs = least_common_multiple (group_size,
7747 const_nunits) / const_nunits;
7748 gcc_assert (elts % group_size == 0);
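/* Worked example (illustrative): group_size == 3 and const_nunits == 4 give
   nivs = least_common_multiple (3, 4) / 4 == 3, matching the three initial
   vectors shown in the comment above; the assertion requires elts
   (== const_nunits * nvects) to be a multiple of the group size.  */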
7749 tree elt = init_expr;
7750 unsigned ivn;
7751 for (ivn = 0; ivn < nivs; ++ivn)
7753 tree_vector_builder elts (vectype, const_nunits, 1);
7754 stmts = NULL;
7755 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7757 if (ivn*const_nunits + eltn >= group_size
7758 && (ivn * const_nunits + eltn) % group_size == 0)
7759 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7760 elt, step_expr);
7761 elts.quick_push (elt);
7763 vec_init = gimple_build_vector (&stmts, &elts);
7764 if (stmts)
7766 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7767 gcc_assert (!new_bb);
7770 /* Create the induction-phi that defines the induction-operand. */
7771 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7772 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7773 set_vinfo_for_stmt (induction_phi,
7774 new_stmt_vec_info (induction_phi, loop_vinfo));
7775 induc_def = PHI_RESULT (induction_phi);
7777 /* Create the iv update inside the loop */
7778 vec_def = make_ssa_name (vec_dest);
7779 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7780 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7781 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7783 /* Set the arguments of the phi node: */
7784 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7785 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7786 UNKNOWN_LOCATION);
7788 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7791 /* Re-use IVs when we can. */
7792 if (ivn < nvects)
7794 unsigned vfp
7795 = least_common_multiple (group_size, const_nunits) / group_size;
7796 /* Generate [VF'*S, VF'*S, ... ]. */
7797 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7799 expr = build_int_cst (integer_type_node, vfp);
7800 expr = fold_convert (TREE_TYPE (step_expr), expr);
7802 else
7803 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7804 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7805 expr, step_expr);
7806 if (! CONSTANT_CLASS_P (new_name))
7807 new_name = vect_init_vector (phi, new_name,
7808 TREE_TYPE (step_expr), NULL);
7809 new_vec = build_vector_from_val (vectype, new_name);
7810 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7811 for (; ivn < nvects; ++ivn)
7813 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7814 tree def;
7815 if (gimple_code (iv) == GIMPLE_PHI)
7816 def = gimple_phi_result (iv);
7817 else
7818 def = gimple_assign_lhs (iv);
7819 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7820 PLUS_EXPR,
7821 def, vec_step);
7822 if (gimple_code (iv) == GIMPLE_PHI)
7823 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7824 else
7826 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7827 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7829 set_vinfo_for_stmt (new_stmt,
7830 new_stmt_vec_info (new_stmt, loop_vinfo));
7831 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7835 return true;
7838 /* Create the vector that holds the initial_value of the induction. */
7839 if (nested_in_vect_loop)
7841 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7842 been created during vectorization of previous stmts. We obtain it
7843 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7844 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7845 /* If the initial value is not of proper type, convert it. */
7846 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7848 new_stmt
7849 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7850 vect_simple_var,
7851 "vec_iv_"),
7852 VIEW_CONVERT_EXPR,
7853 build1 (VIEW_CONVERT_EXPR, vectype,
7854 vec_init));
7855 vec_init = gimple_assign_lhs (new_stmt);
7856 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7857 new_stmt);
7858 gcc_assert (!new_bb);
7859 set_vinfo_for_stmt (new_stmt,
7860 new_stmt_vec_info (new_stmt, loop_vinfo));
7863 else
7865 /* iv_loop is the loop to be vectorized. Create:
7866 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7867 stmts = NULL;
7868 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7870 unsigned HOST_WIDE_INT const_nunits;
7871 if (nunits.is_constant (&const_nunits))
7873 tree_vector_builder elts (vectype, const_nunits, 1);
7874 elts.quick_push (new_name);
7875 for (i = 1; i < const_nunits; i++)
7877 /* Create: new_name_i = new_name + step_expr */
7878 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7879 new_name, step_expr);
7880 elts.quick_push (new_name);
7882 /* Create a vector from [new_name_0, new_name_1, ...,
7883 new_name_nunits-1] */
7884 vec_init = gimple_build_vector (&stmts, &elts);
7886 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7887 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7888 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7889 new_name, step_expr);
7890 else
7892 /* Build:
7893 [base, base, base, ...]
7894 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7895 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7896 gcc_assert (flag_associative_math);
7897 tree index = build_index_vector (vectype, 0, 1);
7898 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7899 new_name);
7900 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7901 step_expr);
7902 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7903 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7904 vec_init, step_vec);
7905 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7906 vec_init, base_vec);
7909 if (stmts)
7911 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7912 gcc_assert (!new_bb);
7917 /* Create the vector that holds the step of the induction. */
7918 if (nested_in_vect_loop)
7919 /* iv_loop is nested in the loop to be vectorized. Generate:
7920 vec_step = [S, S, S, S] */
7921 new_name = step_expr;
7922 else
7924 /* iv_loop is the loop to be vectorized. Generate:
7925 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7926 gimple_seq seq = NULL;
7927 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7929 expr = build_int_cst (integer_type_node, vf);
7930 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7932 else
7933 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7934 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7935 expr, step_expr);
7936 if (seq)
7938 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7939 gcc_assert (!new_bb);
7943 t = unshare_expr (new_name);
7944 gcc_assert (CONSTANT_CLASS_P (new_name)
7945 || TREE_CODE (new_name) == SSA_NAME);
7946 new_vec = build_vector_from_val (vectype, t);
7947 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7950 /* Create the following def-use cycle:
7951 loop prolog:
7952 vec_init = ...
7953 vec_step = ...
7954 loop:
7955 vec_iv = PHI <vec_init, vec_loop>
7957 STMT
7959 vec_loop = vec_iv + vec_step; */
7961 /* Create the induction-phi that defines the induction-operand. */
7962 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7963 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7964 set_vinfo_for_stmt (induction_phi,
7965 new_stmt_vec_info (induction_phi, loop_vinfo));
7966 induc_def = PHI_RESULT (induction_phi);
7968 /* Create the iv update inside the loop */
7969 vec_def = make_ssa_name (vec_dest);
7970 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7971 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7972 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7974 /* Set the arguments of the phi node: */
7975 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7976 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7977 UNKNOWN_LOCATION);
7979 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7981 /* In case the vectorization factor (VF) is bigger than the number
7982 of elements that we can fit in a vectype (nunits), we have to generate
7983 more than one vector stmt, i.e., we need to "unroll" the
7984 vector stmt by a factor VF/nunits. For more details see documentation
7985 in vectorizable_operation. */
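/* Illustration (example values): with nunits == 4 and ncopies == 2 the
   induction PHI starts at [X, X+S, X+2*S, X+3*S]; the step vector built
   below is [4*S, 4*S, 4*S, 4*S], and the loop that follows adds it once to
   produce the second copy [X+4*S, X+5*S, X+6*S, X+7*S].  */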
7987 if (ncopies > 1)
7989 gimple_seq seq = NULL;
7990 stmt_vec_info prev_stmt_vinfo;
7991 /* FORNOW. This restriction should be relaxed. */
7992 gcc_assert (!nested_in_vect_loop);
7994 /* Create the vector that holds the step of the induction. */
7995 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7997 expr = build_int_cst (integer_type_node, nunits);
7998 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8000 else
8001 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8002 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8003 expr, step_expr);
8004 if (seq)
8006 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8007 gcc_assert (!new_bb);
8010 t = unshare_expr (new_name);
8011 gcc_assert (CONSTANT_CLASS_P (new_name)
8012 || TREE_CODE (new_name) == SSA_NAME);
8013 new_vec = build_vector_from_val (vectype, t);
8014 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8016 vec_def = induc_def;
8017 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8018 for (i = 1; i < ncopies; i++)
8020 /* vec_i = vec_prev + vec_step */
8021 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8022 vec_def, vec_step);
8023 vec_def = make_ssa_name (vec_dest, new_stmt);
8024 gimple_assign_set_lhs (new_stmt, vec_def);
8026 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8027 set_vinfo_for_stmt (new_stmt,
8028 new_stmt_vec_info (new_stmt, loop_vinfo));
8029 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8030 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8034 if (nested_in_vect_loop)
8036 /* Find the loop-closed exit-phi of the induction, and record
8037 the final vector of induction results: */
8038 exit_phi = NULL;
8039 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8041 gimple *use_stmt = USE_STMT (use_p);
8042 if (is_gimple_debug (use_stmt))
8043 continue;
8045 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8047 exit_phi = use_stmt;
8048 break;
8051 if (exit_phi)
8053 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8054 /* FORNOW. We do not currently support the case in which an inner-loop
8055 induction is used only outside the outer loop (i.e. not in the outer loop). */
8056 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8057 && !STMT_VINFO_LIVE_P (stmt_vinfo));
8059 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8060 if (dump_enabled_p ())
8062 dump_printf_loc (MSG_NOTE, vect_location,
8063 "vector of inductions after inner-loop:");
8064 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8070 if (dump_enabled_p ())
8072 dump_printf_loc (MSG_NOTE, vect_location,
8073 "transform induction: created def-use cycle: ");
8074 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8075 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8076 SSA_NAME_DEF_STMT (vec_def), 0);
8079 return true;
8082 /* Function vectorizable_live_operation.
8084 STMT computes a value that is used outside the loop. Check if
8085 it can be supported. */
8087 bool
8088 vectorizable_live_operation (gimple *stmt,
8089 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8090 slp_tree slp_node, int slp_index,
8091 gimple **vec_stmt)
8093 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8094 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8095 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8096 imm_use_iterator imm_iter;
8097 tree lhs, lhs_type, bitsize, vec_bitsize;
8098 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8099 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8100 int ncopies;
8101 gimple *use_stmt;
8102 auto_vec<tree> vec_oprnds;
8103 int vec_entry = 0;
8104 poly_uint64 vec_index = 0;
8106 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8108 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8109 return false;
8111 /* FORNOW. CHECKME. */
8112 if (nested_in_vect_loop_p (loop, stmt))
8113 return false;
8115 /* If STMT is not relevant and it is a simple assignment and its inputs are
8116 invariant then it can remain in place, unvectorized. The original last
8117 scalar value that it computes will be used. */
8118 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8120 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8121 if (dump_enabled_p ())
8122 dump_printf_loc (MSG_NOTE, vect_location,
8123 "statement is simple and uses invariant. Leaving in "
8124 "place.\n");
8125 return true;
8128 if (slp_node)
8129 ncopies = 1;
8130 else
8131 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8133 if (slp_node)
8135 gcc_assert (slp_index >= 0);
8137 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8138 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8140 /* Get the last occurrence of the scalar index from the concatenation of
8141 all the slp vectors. Calculate which slp vector it is and the index
8142 within. */
8143 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
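/* Worked example (illustrative): with an SLP group of 3 scalars, 4-element
   vectors and 3 vector stmts, pos = 3*4 - 3 + slp_index; for slp_index == 1
   this is 10, giving vec_entry == 2 and vec_index == 2, i.e. lane 2 of the
   last vector holds the final value of that scalar.  */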
8145 /* Calculate which vector contains the result, and which lane of
8146 that vector we need. */
8147 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8149 if (dump_enabled_p ())
8150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8151 "Cannot determine which vector holds the"
8152 " final result.\n");
8153 return false;
8157 if (!vec_stmt)
8159 /* No transformation required. */
8160 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8162 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8163 OPTIMIZE_FOR_SPEED))
8165 if (dump_enabled_p ())
8166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8167 "can't use a fully-masked loop because "
8168 "the target doesn't support extract last "
8169 "reduction.\n");
8170 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8172 else if (slp_node)
8174 if (dump_enabled_p ())
8175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8176 "can't use a fully-masked loop because an "
8177 "SLP statement is live after the loop.\n");
8178 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8180 else if (ncopies > 1)
8182 if (dump_enabled_p ())
8183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8184 "can't use a fully-masked loop because"
8185 " ncopies is greater than 1.\n");
8186 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8188 else
8190 gcc_assert (ncopies == 1 && !slp_node);
8191 vect_record_loop_mask (loop_vinfo,
8192 &LOOP_VINFO_MASKS (loop_vinfo),
8193 1, vectype);
8196 return true;
8199 /* If stmt has a related stmt, then use that for getting the lhs. */
8200 if (is_pattern_stmt_p (stmt_info))
8201 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8203 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8204 : gimple_get_lhs (stmt);
8205 lhs_type = TREE_TYPE (lhs);
8207 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8208 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8209 : TYPE_SIZE (TREE_TYPE (vectype)));
8210 vec_bitsize = TYPE_SIZE (vectype);
8212 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8213 tree vec_lhs, bitstart;
8214 if (slp_node)
8216 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8218 /* Get the correct slp vectorized stmt. */
8219 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8220 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8221 vec_lhs = gimple_phi_result (phi);
8222 else
8223 vec_lhs = gimple_get_lhs (vec_stmt);
8225 /* Get entry to use. */
8226 bitstart = bitsize_int (vec_index);
8227 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8229 else
8231 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8232 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8233 gcc_checking_assert (ncopies == 1
8234 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8236 /* For multiple copies, get the last copy. */
8237 for (int i = 1; i < ncopies; ++i)
8238 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8239 vec_lhs);
8241 /* Get the last lane in the vector. */
8242 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8245 gimple_seq stmts = NULL;
8246 tree new_tree;
8247 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8249 /* Emit:
8251 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8253 where VEC_LHS is the vectorized live-out result and MASK is
8254 the loop mask for the final iteration. */
8255 gcc_assert (ncopies == 1 && !slp_node);
8256 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8257 tree scalar_res = make_ssa_name (scalar_type);
8258 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8259 1, vectype, 0);
8260 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8261 2, mask, vec_lhs);
8262 gimple_call_set_lhs (new_stmt, scalar_res);
8263 gimple_seq_add_stmt (&stmts, new_stmt);
8265 /* Convert the extracted vector element to the required scalar type. */
8266 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8268 else
8270 tree bftype = TREE_TYPE (vectype);
8271 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8272 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8273 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8274 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8275 &stmts, true, NULL_TREE);
8278 if (stmts)
8279 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8281 /* Replace uses of lhs with the newly computed result. If the use stmt is
8282 a single-argument PHI, just replace all uses of the PHI result; this is
8283 necessary because the LCSSA PHI defining lhs may come before the new stmt. */
8284 use_operand_p use_p;
8285 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8286 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8287 && !is_gimple_debug (use_stmt))
8289 if (gimple_code (use_stmt) == GIMPLE_PHI
8290 && gimple_phi_num_args (use_stmt) == 1)
8292 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8294 else
8296 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8297 SET_USE (use_p, new_tree);
8299 update_stmt (use_stmt);
8302 return true;
8305 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8307 static void
8308 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8310 ssa_op_iter op_iter;
8311 imm_use_iterator imm_iter;
8312 def_operand_p def_p;
8313 gimple *ustmt;
8315 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8317 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8319 basic_block bb;
8321 if (!is_gimple_debug (ustmt))
8322 continue;
8324 bb = gimple_bb (ustmt);
8326 if (!flow_bb_inside_loop_p (loop, bb))
8328 if (gimple_debug_bind_p (ustmt))
8330 if (dump_enabled_p ())
8331 dump_printf_loc (MSG_NOTE, vect_location,
8332 "killing debug use\n");
8334 gimple_debug_bind_reset_value (ustmt);
8335 update_stmt (ustmt);
8337 else
8338 gcc_unreachable ();
8344 /* Given loop represented by LOOP_VINFO, return true if computation of
8345 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8346 otherwise. */
8348 static bool
8349 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8351 /* Constant case. */
8352 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8354 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8355 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8357 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8358 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8359 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8360 return true;
8363 widest_int max;
8364 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8365 /* Check the upper bound of loop niters. */
8366 if (get_max_loop_iterations (loop, &max))
8368 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8369 signop sgn = TYPE_SIGN (type);
8370 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8371 if (max < type_max)
8372 return true;
8374 return false;
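/* Illustration: overflow is only possible when NITERSM1 is the maximum value
   of its type, so that adding 1 wraps NITERS to zero; e.g. for an unsigned
   32-bit count, nitersm1 == 0xffffffff is rejected unless niter analysis
   proves a smaller upper bound for the loop.  */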
8377 /* Return a mask type with half the number of elements as TYPE. */
8379 tree
8380 vect_halve_mask_nunits (tree type)
8382 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8383 return build_truth_vector_type (nunits, current_vector_size);
8386 /* Return a mask type with twice as many elements as TYPE. */
8388 tree
8389 vect_double_mask_nunits (tree type)
8391 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8392 return build_truth_vector_type (nunits, current_vector_size);
8395 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8396 contain a sequence of NVECTORS masks that each control a vector of type
8397 VECTYPE. */
8399 void
8400 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8401 unsigned int nvectors, tree vectype)
8403 gcc_assert (nvectors != 0);
8404 if (masks->length () < nvectors)
8405 masks->safe_grow_cleared (nvectors);
8406 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8407 /* The number of scalars per iteration and the number of vectors are
8408 both compile-time constants. */
8409 unsigned int nscalars_per_iter
8410 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8411 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
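/* Worked example (illustrative): if an rgroup needs nvectors == 2 masks for
   16-element vectors under a vectorization factor of 16, each scalar
   iteration contributes 2 * 16 / 16 == 2 scalars, so max_nscalars_per_iter
   becomes at least 2.  */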
8412 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8414 rgm->max_nscalars_per_iter = nscalars_per_iter;
8415 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8419 /* Given a complete set of masks MASKS, extract mask number INDEX
8420 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8421 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8423 See the comment above vec_loop_masks for more details about the mask
8424 arrangement. */
8426 tree
8427 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8428 unsigned int nvectors, tree vectype, unsigned int index)
8430 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8431 tree mask_type = rgm->mask_type;
8433 /* Populate the rgroup's mask array, if this is the first time we've
8434 used it. */
8435 if (rgm->masks.is_empty ())
8437 rgm->masks.safe_grow_cleared (nvectors);
8438 for (unsigned int i = 0; i < nvectors; ++i)
8440 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8441 /* Provide a dummy definition until the real one is available. */
8442 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8443 rgm->masks[i] = mask;
8447 tree mask = rgm->masks[index];
8448 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8449 TYPE_VECTOR_SUBPARTS (vectype)))
8451 /* A loop mask for data type X can be reused for data type Y
8452 if X has N times more elements than Y and if Y's elements
8453 are N times bigger than X's. In this case each sequence
8454 of N elements in the loop mask will be all-zero or all-one.
8455 We can then view-convert the mask so that each sequence of
8456 N elements is replaced by a single element. */
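/* Illustration (example element counts): a mask created for a vector of 16
   1-byte elements can be reused for a vector of 4 4-byte elements (N == 4);
   every aligned group of 4 bits in the original mask is all-zero or all-one,
   so the VIEW_CONVERT_EXPR below reinterprets it as a 4-element mask.  */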
8457 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8458 TYPE_VECTOR_SUBPARTS (vectype)));
8459 gimple_seq seq = NULL;
8460 mask_type = build_same_sized_truth_vector_type (vectype);
8461 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8462 if (seq)
8463 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8465 return mask;
8468 /* Scale profiling counters by estimation for LOOP which is vectorized
8469 by factor VF. */
8471 static void
8472 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8474 edge preheader = loop_preheader_edge (loop);
8475 /* Reduce loop iterations by the vectorization factor. */
8476 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8477 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8479 if (freq_h.nonzero_p ())
8481 profile_probability p;
8483 /* Avoid dropping loop body profile counter to 0 because of zero count
8484 in loop's preheader. */
8485 if (!(freq_e == profile_count::zero ()))
8486 freq_e = freq_e.force_nonzero ();
8487 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8488 scale_loop_frequencies (loop, p);
8491 edge exit_e = single_exit (loop);
8492 exit_e->probability = profile_probability::always ()
8493 .apply_scale (1, new_est_niter + 1);
8495 edge exit_l = single_pred_edge (loop->latch);
8496 profile_probability prob = exit_l->probability;
8497 exit_l->probability = exit_e->probability.invert ();
8498 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8499 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8502 /* Function vect_transform_loop.
8504 The analysis phase has determined that the loop is vectorizable.
8505 Vectorize the loop - created vectorized stmts to replace the scalar
8506 stmts in the loop, and update the loop exit condition.
8507 Returns scalar epilogue loop if any. */
8509 struct loop *
8510 vect_transform_loop (loop_vec_info loop_vinfo)
8512 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8513 struct loop *epilogue = NULL;
8514 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8515 int nbbs = loop->num_nodes;
8516 int i;
8517 tree niters_vector = NULL_TREE;
8518 tree step_vector = NULL_TREE;
8519 tree niters_vector_mult_vf = NULL_TREE;
8520 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8521 unsigned int lowest_vf = constant_lower_bound (vf);
8522 bool grouped_store;
8523 bool slp_scheduled = false;
8524 gimple *stmt, *pattern_stmt;
8525 gimple_seq pattern_def_seq = NULL;
8526 gimple_stmt_iterator pattern_def_si = gsi_none ();
8527 bool transform_pattern_stmt = false;
8528 bool check_profitability = false;
8529 unsigned int th;
8531 if (dump_enabled_p ())
8532 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8534 /* Use the more conservative vectorization threshold. If the number
8535 of iterations is constant, assume the cost check has been performed
8536 by our caller. If the threshold makes all loops profitable that
8537 run at least the (estimated) vectorization factor number of times,
8538 checking is pointless, too. */
8539 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8540 if (th >= vect_vf_for_cost (loop_vinfo)
8541 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8543 if (dump_enabled_p ())
8544 dump_printf_loc (MSG_NOTE, vect_location,
8545 "Profitability threshold is %d loop iterations.\n",
8546 th);
8547 check_profitability = true;
8550 /* Make sure there exists a single-predecessor exit bb. Do this before
8551 versioning. */
8552 edge e = single_exit (loop);
8553 if (! single_pred_p (e->dest))
8555 split_loop_exit_edge (e);
8556 if (dump_enabled_p ())
8557 dump_printf (MSG_NOTE, "split exit edge\n");
8560 /* Version the loop first, if required, so the profitability check
8561 comes first. */
8563 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8565 poly_uint64 versioning_threshold
8566 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8567 if (check_profitability
8568 && ordered_p (poly_uint64 (th), versioning_threshold))
8570 versioning_threshold = ordered_max (poly_uint64 (th),
8571 versioning_threshold);
8572 check_profitability = false;
8574 vect_loop_versioning (loop_vinfo, th, check_profitability,
8575 versioning_threshold);
8576 check_profitability = false;
8579 /* Make sure there exists a single-predecessor exit bb also on the
8580 scalar loop copy. Do this after versioning but before peeling
8581 so the CFG structure is fine for both the scalar and the if-converted
8582 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8583 loop-closed PHI nodes on the exit. */
8584 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8586 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8587 if (! single_pred_p (e->dest))
8589 split_loop_exit_edge (e);
8590 if (dump_enabled_p ())
8591 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8595 tree niters = vect_build_loop_niters (loop_vinfo);
8596 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8597 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8598 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8599 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8600 &step_vector, &niters_vector_mult_vf, th,
8601 check_profitability, niters_no_overflow);
8603 if (niters_vector == NULL_TREE)
8605 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8606 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8607 && known_eq (lowest_vf, vf))
8609 niters_vector
8610 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8611 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8612 step_vector = build_one_cst (TREE_TYPE (niters));
8614 else
8615 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8616 &step_vector, niters_no_overflow);
8619 /* 1) Make sure the loop header has exactly two entries
8620 2) Make sure we have a preheader basic block. */
8622 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8624 split_edge (loop_preheader_edge (loop));
8626 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8627 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8628 /* This will deal with any possible peeling. */
8629 vect_prepare_for_masked_peels (loop_vinfo);
8631 /* FORNOW: the vectorizer supports only loops whose body consists
8632 of one basic block (header + empty latch). When the vectorizer
8633 supports more involved loop forms, the order in which the BBs are
8634 traversed will need to be reconsidered. */
8636 for (i = 0; i < nbbs; i++)
8638 basic_block bb = bbs[i];
8639 stmt_vec_info stmt_info;
8641 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8642 gsi_next (&si))
8644 gphi *phi = si.phi ();
8645 if (dump_enabled_p ())
8647 dump_printf_loc (MSG_NOTE, vect_location,
8648 "------>vectorizing phi: ");
8649 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8651 stmt_info = vinfo_for_stmt (phi);
8652 if (!stmt_info)
8653 continue;
8655 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8656 vect_loop_kill_debug_uses (loop, phi);
8658 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8659 && !STMT_VINFO_LIVE_P (stmt_info))
8660 continue;
8662 if (STMT_VINFO_VECTYPE (stmt_info)
8663 && (maybe_ne
8664 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8665 && dump_enabled_p ())
8666 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8668 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8669 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8670 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8671 && ! PURE_SLP_STMT (stmt_info))
8673 if (dump_enabled_p ())
8674 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8675 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8679 pattern_stmt = NULL;
8680 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8681 !gsi_end_p (si) || transform_pattern_stmt;)
8683 bool is_store;
8685 if (transform_pattern_stmt)
8686 stmt = pattern_stmt;
8687 else
8689 stmt = gsi_stmt (si);
8690 /* During vectorization remove existing clobber stmts. */
8691 if (gimple_clobber_p (stmt))
8693 unlink_stmt_vdef (stmt);
8694 gsi_remove (&si, true);
8695 release_defs (stmt);
8696 continue;
8700 if (dump_enabled_p ())
8702 dump_printf_loc (MSG_NOTE, vect_location,
8703 "------>vectorizing statement: ");
8704 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8707 stmt_info = vinfo_for_stmt (stmt);
8709 /* vector stmts created in the outer-loop during vectorization of
8710 stmts in an inner-loop may not have a stmt_info, and do not
8711 need to be vectorized. */
8712 if (!stmt_info)
8714 gsi_next (&si);
8715 continue;
8718 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8719 vect_loop_kill_debug_uses (loop, stmt);
8721 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8722 && !STMT_VINFO_LIVE_P (stmt_info))
8724 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8725 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8726 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8727 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8729 stmt = pattern_stmt;
8730 stmt_info = vinfo_for_stmt (stmt);
8732 else
8734 gsi_next (&si);
8735 continue;
8738 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8739 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8740 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8741 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8742 transform_pattern_stmt = true;
8744 /* If pattern statement has def stmts, vectorize them too. */
8745 if (is_pattern_stmt_p (stmt_info))
8747 if (pattern_def_seq == NULL)
8749 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8750 pattern_def_si = gsi_start (pattern_def_seq);
8752 else if (!gsi_end_p (pattern_def_si))
8753 gsi_next (&pattern_def_si);
8754 if (pattern_def_seq != NULL)
8756 gimple *pattern_def_stmt = NULL;
8757 stmt_vec_info pattern_def_stmt_info = NULL;
8759 while (!gsi_end_p (pattern_def_si))
8761 pattern_def_stmt = gsi_stmt (pattern_def_si);
8762 pattern_def_stmt_info
8763 = vinfo_for_stmt (pattern_def_stmt);
8764 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8765 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8766 break;
8767 gsi_next (&pattern_def_si);
8770 if (!gsi_end_p (pattern_def_si))
8772 if (dump_enabled_p ())
8774 dump_printf_loc (MSG_NOTE, vect_location,
8775 "==> vectorizing pattern def "
8776 "stmt: ");
8777 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8778 pattern_def_stmt, 0);
8781 stmt = pattern_def_stmt;
8782 stmt_info = pattern_def_stmt_info;
8784 else
8786 pattern_def_si = gsi_none ();
8787 transform_pattern_stmt = false;
8790 else
8791 transform_pattern_stmt = false;
8794 if (STMT_VINFO_VECTYPE (stmt_info))
8796 poly_uint64 nunits
8797 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8798 if (!STMT_SLP_TYPE (stmt_info)
8799 && maybe_ne (nunits, vf)
8800 && dump_enabled_p ())
8801 /* For SLP, VF is set according to the unrolling factor rather than
8802 the vector size, so this message is not valid for SLP. */
8803 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8806 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8807 reached. */
8808 if (STMT_SLP_TYPE (stmt_info))
8810 if (!slp_scheduled)
8812 slp_scheduled = true;
8814 if (dump_enabled_p ())
8815 dump_printf_loc (MSG_NOTE, vect_location,
8816 "=== scheduling SLP instances ===\n");
8818 vect_schedule_slp (loop_vinfo);
8821 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8822 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8824 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8826 pattern_def_seq = NULL;
8827 gsi_next (&si);
8829 continue;
8833 /* -------- vectorize statement ------------ */
8834 if (dump_enabled_p ())
8835 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8837 grouped_store = false;
8838 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8839 if (is_store)
8841 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8843 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8844 interleaving chain was completed - free all the stores in
8845 the chain. */
8846 gsi_next (&si);
8847 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8849 else
8851 /* Free the attached stmt_vec_info and remove the stmt. */
8852 gimple *store = gsi_stmt (si);
8853 free_stmt_vec_info (store);
8854 unlink_stmt_vdef (store);
8855 gsi_remove (&si, true);
8856 release_defs (store);
8859 /* Stores can only appear at the end of pattern statements. */
8860 gcc_assert (!transform_pattern_stmt);
8861 pattern_def_seq = NULL;
8863 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8865 pattern_def_seq = NULL;
8866 gsi_next (&si);
8868 } /* stmts in BB */
8870 /* Stub out scalar statements that must not survive vectorization.
8871 Doing this here helps with grouped statements, or statements that
8872 are involved in patterns. */
8873 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8874 !gsi_end_p (gsi); gsi_next (&gsi))
8876 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8877 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8879 tree lhs = gimple_get_lhs (call);
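/* A MASK_LOAD call that still has a scalar lhs belongs to dead scalar
   code; replace it with a plain zero assignment so the IL stays valid
   until that code is cleaned up.  */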
8880 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8882 tree zero = build_zero_cst (TREE_TYPE (lhs));
8883 gimple *new_stmt = gimple_build_assign (lhs, zero);
8884 gsi_replace (&gsi, new_stmt, true);
8888 } /* BBs in loop */
8890 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8891 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8892 if (integer_onep (step_vector))
8893 niters_no_overflow = true;
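/* Replace the loop's exit condition with one that makes the vector loop
   iterate NITERS_VECTOR times, stepping the IV by STEP_VECTOR.  */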
8894 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8895 niters_vector_mult_vf, !niters_no_overflow);
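/* Each vector iteration now covers ASSUMED_VF scalar iterations, so
   scale the loop's profile accordingly.  */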
8897 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8898 scale_profile_for_vect_loop (loop, assumed_vf);
8900 /* True if the final iteration might not handle a full vector's
8901 worth of scalar iterations. */
8902 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8903 /* The minimum number of iterations performed by the epilogue. This
8904 is 1 when peeling for gaps because we always need a final scalar
8905 iteration. */
8906 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8907 /* +1 to convert latch counts to loop iteration counts,
8908 -min_epilogue_iters to remove iterations that cannot be performed
8909 by the vector code. */
8910 int bias_for_lowest = 1 - min_epilogue_iters;
8911 int bias_for_assumed = bias_for_lowest;
8912 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8913 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8915 /* When the amount of peeling is known at compile time, the first
8916 iteration will have exactly alignment_npeels active elements.
8917 In the worst case it will have at least one. */
8918 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8919 bias_for_lowest += lowest_vf - min_first_active;
8920 bias_for_assumed += assumed_vf - min_first_active;
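/* For example, with no peeling for gaps and no full masking,
   BIAS_FOR_LOWEST is 1, so an upper bound of N - 1 latch iterations
   (N scalar iterations) becomes floor (N / LOWEST_VF) - 1 latch
   iterations of the vector loop: N = 17, LOWEST_VF = 8 gives
   floor (17 / 8) - 1 = 1, i.e. two vector iterations with one scalar
   iteration left for the epilogue.  */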
8922 /* In these calculations the "- 1" converts loop iteration counts
8923 back to latch counts. */
8924 if (loop->any_upper_bound)
8925 loop->nb_iterations_upper_bound
8926 = (final_iter_may_be_partial
8927 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8928 lowest_vf) - 1
8929 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8930 lowest_vf) - 1);
8931 if (loop->any_likely_upper_bound)
8932 loop->nb_iterations_likely_upper_bound
8933 = (final_iter_may_be_partial
8934 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8935 + bias_for_lowest, lowest_vf) - 1
8936 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8937 + bias_for_lowest, lowest_vf) - 1);
8938 if (loop->any_estimate)
8939 loop->nb_iterations_estimate
8940 = (final_iter_may_be_partial
8941 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8942 assumed_vf) - 1
8943 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8944 assumed_vf) - 1);
8946 if (dump_enabled_p ())
8948 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8950 dump_printf_loc (MSG_NOTE, vect_location,
8951 "LOOP VECTORIZED\n");
8952 if (loop->inner)
8953 dump_printf_loc (MSG_NOTE, vect_location,
8954 "OUTER LOOP VECTORIZED\n");
8955 dump_printf (MSG_NOTE, "\n");
8957 else
8959 dump_printf_loc (MSG_NOTE, vect_location,
8960 "LOOP EPILOGUE VECTORIZED (VS=");
8961 dump_dec (MSG_NOTE, current_vector_size);
8962 dump_printf (MSG_NOTE, ")\n");
8966 /* Free SLP instances here because otherwise stmt reference counting
8967 won't work. */
8968 slp_instance instance;
8969 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8970 vect_free_slp_instance (instance);
8971 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8972 /* Clear the safelen field since its value is no longer valid after
8973 vectorization: the vectorized loop can have loop-carried dependencies. */
8974 loop->safelen = 0;
8976 /* Don't vectorize the epilogue of an epilogue loop. */
8977 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8978 epilogue = NULL;
8980 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8981 epilogue = NULL;
8983 if (epilogue)
8985 auto_vector_sizes vector_sizes;
8986 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8987 unsigned int next_size = 0;
8989 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8990 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8991 && known_eq (vf, lowest_vf))
8993 unsigned int eiters
8994 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8995 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8996 eiters = eiters % lowest_vf;
8997 epilogue->nb_iterations_upper_bound = eiters - 1;
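/* Skip candidate vector sizes that do not evenly divide the current
   vector size or whose vectorization factor would exceed the number of
   remaining epilogue iterations.  */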
8999 unsigned int ratio;
9000 while (next_size < vector_sizes.length ()
9001 && !(constant_multiple_p (current_vector_size,
9002 vector_sizes[next_size], &ratio)
9003 && eiters >= lowest_vf / ratio))
9004 next_size += 1;
9006 else
9007 while (next_size < vector_sizes.length ()
9008 && maybe_lt (current_vector_size, vector_sizes[next_size]))
9009 next_size += 1;
9011 if (next_size == vector_sizes.length ())
9012 epilogue = NULL;
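/* If an epilogue candidate remains, copy the vectorization-related
   flags from the main loop and mark it as a candidate for another
   round of vectorization.  */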
9015 if (epilogue)
9017 epilogue->force_vectorize = loop->force_vectorize;
9018 epilogue->safelen = loop->safelen;
9019 epilogue->dont_vectorize = false;
9021 /* We may need to if-convert the epilogue to vectorize it. */
9022 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9023 tree_if_conversion (epilogue);
9026 return epilogue;
9029 /* The code below performs a simple optimization: it reverts
9030 if-conversion for masked stores, i.e. if the mask of a store is all-zero,
9031 the store is not performed, and neither are the producers of the stored
9032 values where possible. For example,
9033 for (i=0; i<n; i++)
9034 if (c[i])
9036 p1[i] += 1;
9037 p2[i] = p3[i] +2;
9039 this transformation will produce the following semi-hammock:
9041 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9043 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9044 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9045 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9046 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9047 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9048 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9052 void
9053 optimize_mask_stores (struct loop *loop)
9055 basic_block *bbs = get_loop_body (loop);
9056 unsigned nbbs = loop->num_nodes;
9057 unsigned i;
9058 basic_block bb;
9059 struct loop *bb_loop;
9060 gimple_stmt_iterator gsi;
9061 gimple *stmt;
9062 auto_vec<gimple *> worklist;
9064 vect_location = find_loop_location (loop);
9065 /* Pick up all masked stores in the loop, if any. */
9066 for (i = 0; i < nbbs; i++)
9068 bb = bbs[i];
9069 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9070 gsi_next (&gsi))
9072 stmt = gsi_stmt (gsi);
9073 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9074 worklist.safe_push (stmt);
9078 free (bbs);
9079 if (worklist.is_empty ())
9080 return;
9082 /* Loop has masked stores. */
9083 while (!worklist.is_empty ())
9085 gimple *last, *last_store;
9086 edge e, efalse;
9087 tree mask;
9088 basic_block store_bb, join_bb;
9089 gimple_stmt_iterator gsi_to;
9090 tree vdef, new_vdef;
9091 gphi *phi;
9092 tree vectype;
9093 tree zero;
9095 last = worklist.pop ();
9096 mask = gimple_call_arg (last, 2);
9097 bb = gimple_bb (last);
9098 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9099 to the same loop as if_bb. That loop can differ from LOOP when a
9100 two-level loop nest is vectorized and the mask store belongs to the
9101 inner loop. */
9102 e = split_block (bb, last);
9103 bb_loop = bb->loop_father;
9104 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9105 join_bb = e->dest;
9106 store_bb = create_empty_bb (bb);
9107 add_bb_to_loop (store_bb, bb_loop);
9108 e->flags = EDGE_TRUE_VALUE;
9109 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9110 /* Put STORE_BB into the likely part. */
9111 efalse->probability = profile_probability::unlikely ();
9112 store_bb->count = efalse->count ();
9113 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9114 if (dom_info_available_p (CDI_DOMINATORS))
9115 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9116 if (dump_enabled_p ())
9117 dump_printf_loc (MSG_NOTE, vect_location,
9118 "Create new block %d to sink mask stores.",
9119 store_bb->index);
9120 /* Create a vector comparison with a boolean result; if MASK is all-zero, STORE_BB is bypassed. */
9121 vectype = TREE_TYPE (mask);
9122 zero = build_zero_cst (vectype);
9123 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9124 gsi = gsi_last_bb (bb);
9125 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9126 /* Create a new PHI node for the vdef of the last masked store:
9127 .MEM_2 = VDEF <.MEM_1>
9128 will be converted to
9129 .MEM_3 = VDEF <.MEM_1>
9130 and a new PHI node will be created in the join bb:
9131 .MEM_2 = PHI <.MEM_1, .MEM_3>
9133 vdef = gimple_vdef (last);
9134 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9135 gimple_set_vdef (last, new_vdef);
9136 phi = create_phi_node (vdef, join_bb);
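/* The edge from STORE_BB carries the vdef produced by the sunk stores.  */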
9137 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9139 /* Put all masked stores with the same mask into STORE_BB if possible. */
9140 while (true)
9142 gimple_stmt_iterator gsi_from;
9143 gimple *stmt1 = NULL;
9145 /* Move masked store to STORE_BB. */
9146 last_store = last;
9147 gsi = gsi_for_stmt (last);
9148 gsi_from = gsi;
9149 /* Shift GSI to the previous stmt for further traversal. */
9150 gsi_prev (&gsi);
9151 gsi_to = gsi_start_bb (store_bb);
9152 gsi_move_before (&gsi_from, &gsi_to);
9153 /* Reset GSI_TO to the start of the now non-empty STORE_BB. */
9154 gsi_to = gsi_start_bb (store_bb);
9155 if (dump_enabled_p ())
9157 dump_printf_loc (MSG_NOTE, vect_location,
9158 "Move stmt to created bb\n");
9159 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9161 /* Move all stored value producers if possible. */
9162 while (!gsi_end_p (gsi))
9164 tree lhs;
9165 imm_use_iterator imm_iter;
9166 use_operand_p use_p;
9167 bool res;
9169 /* Skip debug statements. */
9170 if (is_gimple_debug (gsi_stmt (gsi)))
9172 gsi_prev (&gsi);
9173 continue;
9175 stmt1 = gsi_stmt (gsi);
9176 /* Do not consider statements writing to memory or having a
9177 volatile operand. */
9178 if (gimple_vdef (stmt1)
9179 || gimple_has_volatile_ops (stmt1))
9180 break;
9181 gsi_from = gsi;
9182 gsi_prev (&gsi);
9183 lhs = gimple_get_lhs (stmt1);
9184 if (!lhs)
9185 break;
9187 /* LHS of vectorized stmt must be SSA_NAME. */
9188 if (TREE_CODE (lhs) != SSA_NAME)
9189 break;
9191 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9193 /* Remove dead scalar statement. */
9194 if (has_zero_uses (lhs))
9196 gsi_remove (&gsi_from, true);
9197 continue;
9201 /* Check that LHS does not have uses outside of STORE_BB. */
9202 res = true;
9203 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9205 gimple *use_stmt;
9206 use_stmt = USE_STMT (use_p);
9207 if (is_gimple_debug (use_stmt))
9208 continue;
9209 if (gimple_bb (use_stmt) != store_bb)
9211 res = false;
9212 break;
9215 if (!res)
9216 break;
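/* STMT1 must see the same memory state as the store being sunk;
   a different VUSE would mean an intervening memory write.  */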
9218 if (gimple_vuse (stmt1)
9219 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9220 break;
9222 /* Can move STMT1 to STORE_BB. */
9223 if (dump_enabled_p ())
9225 dump_printf_loc (MSG_NOTE, vect_location,
9226 "Move stmt to created bb\n");
9227 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9229 gsi_move_before (&gsi_from, &gsi_to);
9230 /* Shift GSI_TO for further insertion. */
9231 gsi_prev (&gsi_to);
9233 /* Put other masked stores with the same mask into STORE_BB. */
9234 if (worklist.is_empty ()
9235 || gimple_call_arg (worklist.last (), 2) != mask
9236 || worklist.last () != stmt1)
9237 break;
9238 last = worklist.pop ();
9240 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);