gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56
  57 /* Loop Vectorization Pass.
  58
  59    This pass tries to vectorize loops.
  60
  61    For example, the vectorizer transforms the following simple loop:
  62
  63         short a[N]; short b[N]; short c[N]; int i;
  64
  65         for (i=0; i<N; i++){
  66           a[i] = b[i] + c[i];
  67         }
  68
  69    as if it was manually vectorized by rewriting the source code into:
  70
  71         typedef int __attribute__((mode(V8HI))) v8hi;
  72         short a[N];  short b[N]; short c[N];   int i;
  73         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  74         v8hi va, vb, vc;
  75
  76         for (i=0; i<N/8; i++){
  77           vb = pb[i];
  78           vc = pc[i];
  79           va = vb + vc;
  80           pa[i] = va;
  81         }
  82
  83         The main entry to this pass is vectorize_loops(), in which
  84    the vectorizer applies a set of analyses on a given set of loops,
  85    followed by the actual vectorization transformation for the loops that
  86    had successfully passed the analysis phase.
  87         Throughout this pass we make a distinction between two types of
  88    data: scalars (which are represented by SSA_NAMES), and memory references
  89    ("data-refs").  These two types of data require different handling both
  90    during analysis and transformation. The types of data-refs that the
  91    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  92    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  93    accesses are required to have a simple (consecutive) access pattern.
  94
  95    Analysis phase:
  96    ===============
  97         The driver for the analysis phase is vect_analyze_loop().
  98    It applies a set of analyses, some of which rely on the scalar evolution
  99    analyzer (scev) developed by Sebastian Pop.
 100
 101         During the analysis phase the vectorizer records some information
 102    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 103    loop, as well as general information about the loop as a whole, which is
 104    recorded in a "loop_vec_info" struct attached to each loop.
 105
 106    Transformation phase:
 107    =====================
 108         The loop transformation phase scans all the stmts in the loop, and
 109    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 110    the loop that needs to be vectorized.  It inserts the vector code sequence
 111    just before the scalar stmt S, and records a pointer to the vector code
 112    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 113    attached to S).  This pointer will be used for the vectorization of following
 114    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 115    otherwise, we rely on dead code elimination for removing it.
 116
 117         For example, say stmt S1 was vectorized into stmt VS1:
 118
 119    VS1: vb = px[i];
 120    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 121    S2:  a = b;
 122
 123    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 124    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 125    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 126    resulting sequence would be:
 127
 128    VS1: vb = px[i];
 129    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 130    VS2: va = vb;
 131    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 132
 133         Operands that are not SSA_NAMEs, are data-refs that appear in
 134    load/store operations (like 'x[i]' in S1), and are handled differently.
 135
 136    Target modeling:
 137    =================
 138         Currently the only target specific information that is used is the
 139    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 140    Targets that can support different sizes of vectors, for now will need
 141    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 142    flexibility will be added in the future.
 143
 144         Since we only vectorize operations which vector form can be
 145    expressed using existing tree codes, to verify that an operation is
 146    supported, the vectorizer checks the relevant optab at the relevant
 147    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 148    the value found is CODE_FOR_nothing, then there's no target support, and
 149    we can't vectorize the stmt.
 150
 151    For additional information on this project see:
 152    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 153 */
 154
 155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 156
 157 /* Function vect_determine_vectorization_factor
 158
 159    Determine the vectorization factor (VF).  VF is the number of data elements
 160    that are operated upon in parallel in a single iteration of the vectorized
 161    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 162    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 163    elements can fit in a single vector register.
 164
 165    We currently support vectorization of loops in which all types operated upon
 166    are of the same size.  Therefore this function currently sets VF according to
 167    the size of the types operated upon, and fails if there are multiple sizes
 168    in the loop.
 169
 170    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 171    original loop:
 172         for (i=0; i<N; i++){
 173           a[i] = b[i] + c[i];
 174         }
 175
 176    vectorized loop:
 177         for (i=0; i<N; i+=VF){
 178           a[i:VF] = b[i:VF] + c[i:VF];
 179         }
 180 */
 181
 182 static bool
 183 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 184 {
 185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 187   unsigned nbbs = loop->num_nodes;
 188   poly_uint64 vectorization_factor = 1;
 189   tree scalar_type = NULL_TREE;
 190   gphi *phi;
 191   tree vectype;
 192   stmt_vec_info stmt_info;
 193   unsigned i;
 194   HOST_WIDE_INT dummy;
 195   gimple *stmt, *pattern_stmt = NULL;
 196   gimple_seq pattern_def_seq = NULL;
 197   gimple_stmt_iterator pattern_def_si = gsi_none ();
 198   bool analyze_pattern_stmt = false;
 199   bool bool_result;
 200   auto_vec<stmt_vec_info> mask_producers;
 201
 202   if (dump_enabled_p ())
 203     dump_printf_loc (MSG_NOTE, vect_location,
 204                      "=== vect_determine_vectorization_factor ===\n");
 205
 206   for (i = 0; i < nbbs; i++)
 207     {
 208       basic_block bb = bbs[i];
 209
 210       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 211            gsi_next (&si))
 212         {
 213           phi = si.phi ();
 214           stmt_info = vinfo_for_stmt (phi);
 215           if (dump_enabled_p ())
 216             {
 217               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 218               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 219             }
 220
 221           gcc_assert (stmt_info);
 222
 223           if (STMT_VINFO_RELEVANT_P (stmt_info)
 224               || STMT_VINFO_LIVE_P (stmt_info))
 225             {
 226               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 227               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 228
 229               if (dump_enabled_p ())
 230                 {
 231                   dump_printf_loc (MSG_NOTE, vect_location,
 232                                    "get vectype for scalar type:  ");
 233                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 234                   dump_printf (MSG_NOTE, "\n");
 235                 }
 236
 237               vectype = get_vectype_for_scalar_type (scalar_type);
 238               if (!vectype)
 239                 {
 240                   if (dump_enabled_p ())
 241                     {
 242                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 243                                        "not vectorized: unsupported "
 244                                        "data-type ");
 245                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 246                                          scalar_type);
 247                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 248                     }
 249                   return false;
 250                 }
 251               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 252
 253               if (dump_enabled_p ())
 254                 {
 255                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 256                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 257                   dump_printf (MSG_NOTE, "\n");
 258                 }
 259
 260               if (dump_enabled_p ())
 261                 {
 262                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 263                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 264                   dump_printf (MSG_NOTE, "\n");
 265                 }
 266
 267               vect_update_max_nunits (&vectorization_factor, vectype);
 268             }
 269         }
 270
 271       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 272            !gsi_end_p (si) || analyze_pattern_stmt;)
 273         {
 274           tree vf_vectype;
 275
 276           if (analyze_pattern_stmt)
 277             stmt = pattern_stmt;
 278           else
 279             stmt = gsi_stmt (si);
 280
 281           stmt_info = vinfo_for_stmt (stmt);
 282
 283           if (dump_enabled_p ())
 284             {
 285               dump_printf_loc (MSG_NOTE, vect_location,
 286                                "==> examining statement: ");
 287               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 288             }
 289
 290           gcc_assert (stmt_info);
 291
 292           /* Skip stmts which do not need to be vectorized.  */
 293           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 294                && !STMT_VINFO_LIVE_P (stmt_info))
 295               || gimple_clobber_p (stmt))
 296             {
 297               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 298                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 299                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 300                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 301                 {
 302                   stmt = pattern_stmt;
 303                   stmt_info = vinfo_for_stmt (pattern_stmt);
 304                   if (dump_enabled_p ())
 305                     {
 306                       dump_printf_loc (MSG_NOTE, vect_location,
 307                                        "==> examining pattern statement: ");
 308                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 309                     }
 310                 }
 311               else
 312                 {
 313                   if (dump_enabled_p ())
 314                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 315                   gsi_next (&si);
 316                   continue;
 317                 }
 318             }
 319           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 320                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 321                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 322                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 323             analyze_pattern_stmt = true;
 324
 325           /* If a pattern statement has def stmts, analyze them too.  */
 326           if (is_pattern_stmt_p (stmt_info))
 327             {
 328               if (pattern_def_seq == NULL)
 329                 {
 330                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 331                   pattern_def_si = gsi_start (pattern_def_seq);
 332                 }
 333               else if (!gsi_end_p (pattern_def_si))
 334                 gsi_next (&pattern_def_si);
 335               if (pattern_def_seq != NULL)
 336                 {
 337                   gimple *pattern_def_stmt = NULL;
 338                   stmt_vec_info pattern_def_stmt_info = NULL;
 339
 340                   while (!gsi_end_p (pattern_def_si))
 341                     {
 342                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 343                       pattern_def_stmt_info
 344                         = vinfo_for_stmt (pattern_def_stmt);
 345                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 346                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 347                         break;
 348                       gsi_next (&pattern_def_si);
 349                     }
 350
 351                   if (!gsi_end_p (pattern_def_si))
 352                     {
 353                       if (dump_enabled_p ())
 354                         {
 355                           dump_printf_loc (MSG_NOTE, vect_location,
 356                                            "==> examining pattern def stmt: ");
 357                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 358                                             pattern_def_stmt, 0);
 359                         }
 360
 361                       stmt = pattern_def_stmt;
 362                       stmt_info = pattern_def_stmt_info;
 363                     }
 364                   else
 365                     {
 366                       pattern_def_si = gsi_none ();
 367                       analyze_pattern_stmt = false;
 368                     }
 369                 }
 370               else
 371                 analyze_pattern_stmt = false;
 372             }
 373
 374           if (gimple_get_lhs (stmt) == NULL_TREE
 375               /* MASK_STORE has no lhs, but is ok.  */
 376               && (!is_gimple_call (stmt)
 377                   || !gimple_call_internal_p (stmt)
 378                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 379             {
 380               if (is_gimple_call (stmt))
 381                 {
 382                   /* Ignore calls with no lhs.  These must be calls to
 383                      #pragma omp simd functions, and what vectorization factor
 384                      it really needs can't be determined until
 385                      vectorizable_simd_clone_call.  */
 386                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 387                     {
 388                       pattern_def_seq = NULL;
 389                       gsi_next (&si);
 390                     }
 391                   continue;
 392                 }
 393               if (dump_enabled_p ())
 394                 {
 395                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 396                                    "not vectorized: irregular stmt.");
 397                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 398                                     0);
 399                 }
 400               return false;
 401             }
 402
 403           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 404             {
 405               if (dump_enabled_p ())
 406                 {
 407                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 408                                    "not vectorized: vector stmt in loop:");
 409                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 410                 }
 411               return false;
 412             }
 413
 414           bool_result = false;
 415
 416           if (STMT_VINFO_VECTYPE (stmt_info))
 417             {
 418               /* The only case when a vectype had been already set is for stmts
 419                  that contain a dataref, or for "pattern-stmts" (stmts
 420                  generated by the vectorizer to represent/replace a certain
 421                  idiom).  */
 422               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 423                           || is_pattern_stmt_p (stmt_info)
 424                           || !gsi_end_p (pattern_def_si));
 425               vectype = STMT_VINFO_VECTYPE (stmt_info);
 426             }
 427           else
 428             {
 429               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 430               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 431                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 432               else
 433                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 434
 435               /* Bool ops don't participate in vectorization factor
 436                  computation.  For comparison use compared types to
 437                  compute a factor.  */
 438               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 439                   && is_gimple_assign (stmt)
 440                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 441                 {
 442                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 443                       || STMT_VINFO_LIVE_P (stmt_info))
 444                     mask_producers.safe_push (stmt_info);
 445                   bool_result = true;
 446
 447                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 448                       == tcc_comparison
 449                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 450                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 451                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 452                   else
 453                     {
 454                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 455                         {
 456                           pattern_def_seq = NULL;
 457                           gsi_next (&si);
 458                         }
 459                       continue;
 460                     }
 461                 }
 462
 463               if (dump_enabled_p ())
 464                 {
 465                   dump_printf_loc (MSG_NOTE, vect_location,
 466                                    "get vectype for scalar type:  ");
 467                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 468                   dump_printf (MSG_NOTE, "\n");
 469                 }
 470               vectype = get_vectype_for_scalar_type (scalar_type);
 471               if (!vectype)
 472                 {
 473                   if (dump_enabled_p ())
 474                     {
 475                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 476                                        "not vectorized: unsupported "
 477                                        "data-type ");
 478                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 479                                          scalar_type);
 480                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 481                     }
 482                   return false;
 483                 }
 484
 485               if (!bool_result)
 486                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 487
 488               if (dump_enabled_p ())
 489                 {
 490                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 491                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 492                   dump_printf (MSG_NOTE, "\n");
 493                 }
 494             }
 495
 496           /* Don't try to compute VF out scalar types if we stmt
 497              produces boolean vector.  Use result vectype instead.  */
 498           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 499             vf_vectype = vectype;
 500           else
 501             {
 502               /* The vectorization factor is according to the smallest
 503                  scalar type (or the largest vector size, but we only
 504                  support one vector size per loop).  */
 505               if (!bool_result)
 506                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 507                                                              &dummy);
 508               if (dump_enabled_p ())
 509                 {
 510                   dump_printf_loc (MSG_NOTE, vect_location,
 511                                    "get vectype for scalar type:  ");
 512                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 513                   dump_printf (MSG_NOTE, "\n");
 514                 }
 515               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 516             }
 517           if (!vf_vectype)
 518             {
 519               if (dump_enabled_p ())
 520                 {
 521                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 522                                    "not vectorized: unsupported data-type ");
 523                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 524                                      scalar_type);
 525                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 526                 }
 527               return false;
 528             }
 529
 530           if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
 531                         GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 532             {
 533               if (dump_enabled_p ())
 534                 {
 535                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 536                                    "not vectorized: different sized vector "
 537                                    "types in statement, ");
 538                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 539                                      vectype);
 540                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 541                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 542                                      vf_vectype);
 543                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 544                 }
 545               return false;
 546             }
 547
 548           if (dump_enabled_p ())
 549             {
 550               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 551               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 552               dump_printf (MSG_NOTE, "\n");
 553             }
 554
 555           if (dump_enabled_p ())
 556             {
 557               dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 558               dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
 559               dump_printf (MSG_NOTE, "\n");
 560             }
 561
 562           vect_update_max_nunits (&vectorization_factor, vf_vectype);
 563
 564           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 565             {
 566               pattern_def_seq = NULL;
 567               gsi_next (&si);
 568             }
 569         }
 570     }
 571
 572   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 573   if (dump_enabled_p ())
 574     {
 575       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 576       dump_dec (MSG_NOTE, vectorization_factor);
 577       dump_printf (MSG_NOTE, "\n");
 578     }
 579
 580   if (known_le (vectorization_factor, 1U))
 581     {
 582       if (dump_enabled_p ())
 583         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 584                          "not vectorized: unsupported data-type\n");
 585       return false;
 586     }
 587   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 588
 589   for (i = 0; i < mask_producers.length (); i++)
 590     {
 591       tree mask_type = NULL;
 592
 593       stmt = STMT_VINFO_STMT (mask_producers[i]);
 594
 595       if (is_gimple_assign (stmt)
 596           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 597           && !VECT_SCALAR_BOOLEAN_TYPE_P
 598                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 599         {
 600           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 601           mask_type = get_mask_type_for_scalar_type (scalar_type);
 602
 603           if (!mask_type)
 604             {
 605               if (dump_enabled_p ())
 606                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 607                                  "not vectorized: unsupported mask\n");
 608               return false;
 609             }
 610         }
 611       else
 612         {
 613           tree rhs;
 614           ssa_op_iter iter;
 615           gimple *def_stmt;
 616           enum vect_def_type dt;
 617
 618           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 619             {
 620               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 621                                        &def_stmt, &dt, &vectype))
 622                 {
 623                   if (dump_enabled_p ())
 624                     {
 625                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 626                                        "not vectorized: can't compute mask type "
 627                                        "for statement, ");
 628                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 629                                         0);
 630                     }
 631                   return false;
 632                 }
 633
 634               /* No vectype probably means external definition.
 635                  Allow it in case there is another operand which
 636                  allows to determine mask type.  */
 637               if (!vectype)
 638                 continue;
 639
 640               if (!mask_type)
 641                 mask_type = vectype;
 642               else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
 643                                  TYPE_VECTOR_SUBPARTS (vectype)))
 644                 {
 645                   if (dump_enabled_p ())
 646                     {
 647                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 648                                        "not vectorized: different sized masks "
 649                                        "types in statement, ");
 650                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 651                                          mask_type);
 652                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 653                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 654                                          vectype);
 655                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 656                     }
 657                   return false;
 658                 }
 659               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 660                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 661                 {
 662                   if (dump_enabled_p ())
 663                     {
 664                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 665                                        "not vectorized: mixed mask and "
 666                                        "nonmask vector types in statement, ");
 667                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 668                                          mask_type);
 669                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 670                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 671                                          vectype);
 672                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 673                     }
 674                   return false;
 675                 }
 676             }
 677
 678           /* We may compare boolean value loaded as vector of integers.
 679              Fix mask_type in such case.  */
 680           if (mask_type
 681               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 682               && gimple_code (stmt) == GIMPLE_ASSIGN
 683               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 684             mask_type = build_same_sized_truth_vector_type (mask_type);
 685         }
 686
 687       /* No mask_type should mean loop invariant predicate.
 688          This is probably a subject for optimization in
 689          if-conversion.  */
 690       if (!mask_type)
 691         {
 692           if (dump_enabled_p ())
 693             {
 694               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 695                                "not vectorized: can't compute mask type "
 696                                "for statement, ");
 697               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 698                                 0);
 699             }
 700           return false;
 701         }
 702
 703       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 704     }
 705
 706   return true;
 707 }
 708
 709
 710 /* Function vect_is_simple_iv_evolution.
 711
 712    FORNOW: A simple evolution of an induction variables in the loop is
 713    considered a polynomial evolution.  */
 714
 715 static bool
 716 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 717                              tree * step)
 718 {
 719   tree init_expr;
 720   tree step_expr;
 721   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 722   basic_block bb;
 723
 724   /* When there is no evolution in this loop, the evolution function
 725      is not "simple".  */
 726   if (evolution_part == NULL_TREE)
 727     return false;
 728
 729   /* When the evolution is a polynomial of degree >= 2
 730      the evolution function is not "simple".  */
 731   if (tree_is_chrec (evolution_part))
 732     return false;
 733
 734   step_expr = evolution_part;
 735   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 736
 737   if (dump_enabled_p ())
 738     {
 739       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 740       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 741       dump_printf (MSG_NOTE, ",  init: ");
 742       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 743       dump_printf (MSG_NOTE, "\n");
 744     }
 745
 746   *init = init_expr;
 747   *step = step_expr;
 748
 749   if (TREE_CODE (step_expr) != INTEGER_CST
 750       && (TREE_CODE (step_expr) != SSA_NAME
 751           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 752               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 753           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 754               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 755                   || !flag_associative_math)))
 756       && (TREE_CODE (step_expr) != REAL_CST
 757           || !flag_associative_math))
 758     {
 759       if (dump_enabled_p ())
 760         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 761                          "step unknown.\n");
 762       return false;
 763     }
 764
 765   return true;
 766 }
 767
 768 /* Function vect_analyze_scalar_cycles_1.
 769
 770    Examine the cross iteration def-use cycles of scalar variables
 771    in LOOP.  LOOP_VINFO represents the loop that is now being
 772    considered for vectorization (can be LOOP, or an outer-loop
 773    enclosing LOOP).  */
 774
 775 static void
 776 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 777 {
 778   basic_block bb = loop->header;
 779   tree init, step;
 780   auto_vec<gimple *, 64> worklist;
 781   gphi_iterator gsi;
 782   bool double_reduc;
 783
 784   if (dump_enabled_p ())
 785     dump_printf_loc (MSG_NOTE, vect_location,
 786                      "=== vect_analyze_scalar_cycles ===\n");
 787
 788   /* First - identify all inductions.  Reduction detection assumes that all the
 789      inductions have been identified, therefore, this order must not be
 790      changed.  */
 791   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 792     {
 793       gphi *phi = gsi.phi ();
 794       tree access_fn = NULL;
 795       tree def = PHI_RESULT (phi);
 796       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 797
 798       if (dump_enabled_p ())
 799         {
 800           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 801           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 802         }
 803
 804       /* Skip virtual phi's.  The data dependences that are associated with
 805          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 806       if (virtual_operand_p (def))
 807         continue;
 808
 809       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 810
 811       /* Analyze the evolution function.  */
 812       access_fn = analyze_scalar_evolution (loop, def);
 813       if (access_fn)
 814         {
 815           STRIP_NOPS (access_fn);
 816           if (dump_enabled_p ())
 817             {
 818               dump_printf_loc (MSG_NOTE, vect_location,
 819                                "Access function of PHI: ");
 820               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 821               dump_printf (MSG_NOTE, "\n");
 822             }
 823           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 824             = initial_condition_in_loop_num (access_fn, loop->num);
 825           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 826             = evolution_part_in_loop_num (access_fn, loop->num);
 827         }
 828
 829       if (!access_fn
 830           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 831           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 832               && TREE_CODE (step) != INTEGER_CST))
 833         {
 834           worklist.safe_push (phi);
 835           continue;
 836         }
 837
 838       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 839                   != NULL_TREE);
 840       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 841
 842       if (dump_enabled_p ())
 843         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 844       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 845     }
 846
 847
 848   /* Second - identify all reductions and nested cycles.  */
 849   while (worklist.length () > 0)
 850     {
 851       gimple *phi = worklist.pop ();
 852       tree def = PHI_RESULT (phi);
 853       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 854       gimple *reduc_stmt;
 855
 856       if (dump_enabled_p ())
 857         {
 858           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 859           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 860         }
 861
 862       gcc_assert (!virtual_operand_p (def)
 863                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 864
 865       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 866                                                 &double_reduc, false);
 867       if (reduc_stmt)
 868         {
 869           if (double_reduc)
 870             {
 871               if (dump_enabled_p ())
 872                 dump_printf_loc (MSG_NOTE, vect_location,
 873                                  "Detected double reduction.\n");
 874
 875               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 876               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 877                                                     vect_double_reduction_def;
 878             }
 879           else
 880             {
 881               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 882                 {
 883                   if (dump_enabled_p ())
 884                     dump_printf_loc (MSG_NOTE, vect_location,
 885                                      "Detected vectorizable nested cycle.\n");
 886
 887                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 888                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 889                                                              vect_nested_cycle;
 890                 }
 891               else
 892                 {
 893                   if (dump_enabled_p ())
 894                     dump_printf_loc (MSG_NOTE, vect_location,
 895                                      "Detected reduction.\n");
 896
 897                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 898                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 899                                                            vect_reduction_def;
 900                   /* Store the reduction cycles for possible vectorization in
 901                      loop-aware SLP if it was not detected as reduction
 902                      chain.  */
 903                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 904                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 905                 }
 906             }
 907         }
 908       else
 909         if (dump_enabled_p ())
 910           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 911                            "Unknown def-use cycle pattern.\n");
 912     }
 913 }
 914
 915
 916 /* Function vect_analyze_scalar_cycles.
 917
 918    Examine the cross iteration def-use cycles of scalar variables, by
 919    analyzing the loop-header PHIs of scalar variables.  Classify each
 920    cycle as one of the following: invariant, induction, reduction, unknown.
 921    We do that for the loop represented by LOOP_VINFO, and also to its
 922    inner-loop, if exists.
 923    Examples for scalar cycles:
 924
 925    Example1: reduction:
 926
 927               loop1:
 928               for (i=0; i<N; i++)
 929                  sum += a[i];
 930
 931    Example2: induction:
 932
 933               loop2:
 934               for (i=0; i<N; i++)
 935                  a[i] = i;  */
 936
 937 static void
 938 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 939 {
 940   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 941
 942   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 943
 944   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 945      Reductions in such inner-loop therefore have different properties than
 946      the reductions in the nest that gets vectorized:
 947      1. When vectorized, they are executed in the same order as in the original
 948         scalar loop, so we can't change the order of computation when
 949         vectorizing them.
 950      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 951         current checks are too strict.  */
 952
 953   if (loop->inner)
 954     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 955 }
 956
 957 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 958
 959 static void
 960 vect_fixup_reduc_chain (gimple *stmt)
 961 {
 962   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 963   gimple *stmtp;
 964   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 965               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 966   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 967   do
 968     {
 969       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 970       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 971       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 972       if (stmt)
 973         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 974           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 975     }
 976   while (stmt);
 977   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 978 }
 979
 980 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 981
 982 static void
 983 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 984 {
 985   gimple *first;
 986   unsigned i;
 987
 988   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 989     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 990       {
 991         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 992         while (next)
 993           {
 994             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 995               break;
 996             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 997           }
 998         /* If not all stmt in the chain are patterns try to handle
 999            the chain without patterns.  */
1000         if (! next)
1001           {
1002             vect_fixup_reduc_chain (first);
1003             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1004               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1005           }
1006       }
1007 }
1008
1009 /* Function vect_get_loop_niters.
1010
1011    Determine how many iterations the loop is executed and place it
1012    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1013    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1014    niter information holds in ASSUMPTIONS.
1015
1016    Return the loop exit condition.  */
1017
1018
1019 static gcond *
1020 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1021                       tree *number_of_iterations, tree *number_of_iterationsm1)
1022 {
1023   edge exit = single_exit (loop);
1024   struct tree_niter_desc niter_desc;
1025   tree niter_assumptions, niter, may_be_zero;
1026   gcond *cond = get_loop_exit_condition (loop);
1027
1028   *assumptions = boolean_true_node;
1029   *number_of_iterationsm1 = chrec_dont_know;
1030   *number_of_iterations = chrec_dont_know;
1031   if (dump_enabled_p ())
1032     dump_printf_loc (MSG_NOTE, vect_location,
1033                      "=== get_loop_niters ===\n");
1034
1035   if (!exit)
1036     return cond;
1037
1038   niter = chrec_dont_know;
1039   may_be_zero = NULL_TREE;
1040   niter_assumptions = boolean_true_node;
1041   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1042       || chrec_contains_undetermined (niter_desc.niter))
1043     return cond;
1044
1045   niter_assumptions = niter_desc.assumptions;
1046   may_be_zero = niter_desc.may_be_zero;
1047   niter = niter_desc.niter;
1048
1049   if (may_be_zero && integer_zerop (may_be_zero))
1050     may_be_zero = NULL_TREE;
1051
1052   if (may_be_zero)
1053     {
1054       if (COMPARISON_CLASS_P (may_be_zero))
1055         {
1056           /* Try to combine may_be_zero with assumptions, this can simplify
1057              computation of niter expression.  */
1058           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1059             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1060                                              niter_assumptions,
1061                                              fold_build1 (TRUTH_NOT_EXPR,
1062                                                           boolean_type_node,
1063                                                           may_be_zero));
1064           else
1065             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1066                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1067
1068           may_be_zero = NULL_TREE;
1069         }
1070       else if (integer_nonzerop (may_be_zero))
1071         {
1072           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1073           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1074           return cond;
1075         }
1076       else
1077         return cond;
1078     }
1079
1080   *assumptions = niter_assumptions;
1081   *number_of_iterationsm1 = niter;
1082
1083   /* We want the number of loop header executions which is the number
1084      of latch executions plus one.
1085      ???  For UINT_MAX latch executions this number overflows to zero
1086      for loops like do { n++; } while (n != 0);  */
1087   if (niter && !chrec_contains_undetermined (niter))
1088     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1089                           build_int_cst (TREE_TYPE (niter), 1));
1090   *number_of_iterations = niter;
1091
1092   return cond;
1093 }
1094
1095 /* Function bb_in_loop_p
1096
1097    Used as predicate for dfs order traversal of the loop bbs.  */
1098
1099 static bool
1100 bb_in_loop_p (const_basic_block bb, const void *data)
1101 {
1102   const struct loop *const loop = (const struct loop *)data;
1103   if (flow_bb_inside_loop_p (loop, bb))
1104     return true;
1105   return false;
1106 }
1107
1108
1109 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1110    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1111
1112 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1113   : vec_info (vec_info::loop, init_cost (loop_in)),
1114     loop (loop_in),
1115     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1116     num_itersm1 (NULL_TREE),
1117     num_iters (NULL_TREE),
1118     num_iters_unchanged (NULL_TREE),
1119     num_iters_assumptions (NULL_TREE),
1120     th (0),
1121     versioning_threshold (0),
1122     vectorization_factor (0),
1123     max_vectorization_factor (0),
1124     mask_skip_niters (NULL_TREE),
1125     mask_compare_type (NULL_TREE),
1126     unaligned_dr (NULL),
1127     peeling_for_alignment (0),
1128     ptr_mask (0),
1129     slp_unrolling_factor (1),
1130     single_scalar_iteration_cost (0),
1131     vectorizable (false),
1132     can_fully_mask_p (true),
1133     fully_masked_p (false),
1134     peeling_for_gaps (false),
1135     peeling_for_niter (false),
1136     operands_swapped (false),
1137     no_data_dependencies (false),
1138     has_mask_store (false),
1139     scalar_loop (NULL),
1140     orig_loop_info (NULL)
1141 {
1142   /* Create/Update stmt_info for all stmts in the loop.  */
1143   basic_block *body = get_loop_body (loop);
1144   for (unsigned int i = 0; i < loop->num_nodes; i++)
1145     {
1146       basic_block bb = body[i];
1147       gimple_stmt_iterator si;
1148
1149       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1150         {
1151           gimple *phi = gsi_stmt (si);
1152           gimple_set_uid (phi, 0);
1153           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1154         }
1155
1156       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1157         {
1158           gimple *stmt = gsi_stmt (si);
1159           gimple_set_uid (stmt, 0);
1160           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1161         }
1162     }
1163   free (body);
1164
1165   /* CHECKME: We want to visit all BBs before their successors (except for
1166      latch blocks, for which this assertion wouldn't hold).  In the simple
1167      case of the loop forms we allow, a dfs order of the BBs would the same
1168      as reversed postorder traversal, so we are safe.  */
1169
1170   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1171                                           bbs, loop->num_nodes, loop);
1172   gcc_assert (nbbs == loop->num_nodes);
1173 }
1174
1175 /* Free all levels of MASKS.  */
1176
1177 void
1178 release_vec_loop_masks (vec_loop_masks *masks)
1179 {
1180   rgroup_masks *rgm;
1181   unsigned int i;
1182   FOR_EACH_VEC_ELT (*masks, i, rgm)
1183     rgm->masks.release ();
1184   masks->release ();
1185 }
1186
1187 /* Free all memory used by the _loop_vec_info, as well as all the
1188    stmt_vec_info structs of all the stmts in the loop.  */
1189
1190 _loop_vec_info::~_loop_vec_info ()
1191 {
1192   int nbbs;
1193   gimple_stmt_iterator si;
1194   int j;
1195
1196   nbbs = loop->num_nodes;
1197   for (j = 0; j < nbbs; j++)
1198     {
1199       basic_block bb = bbs[j];
1200       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1201         free_stmt_vec_info (gsi_stmt (si));
1202
1203       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1204         {
1205           gimple *stmt = gsi_stmt (si);
1206
1207           /* We may have broken canonical form by moving a constant
1208              into RHS1 of a commutative op.  Fix such occurrences.  */
1209           if (operands_swapped && is_gimple_assign (stmt))
1210             {
1211               enum tree_code code = gimple_assign_rhs_code (stmt);
1212
1213               if ((code == PLUS_EXPR
1214                    || code == POINTER_PLUS_EXPR
1215                    || code == MULT_EXPR)
1216                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1217                 swap_ssa_operands (stmt,
1218                                    gimple_assign_rhs1_ptr (stmt),
1219                                    gimple_assign_rhs2_ptr (stmt));
1220               else if (code == COND_EXPR
1221                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1222                 {
1223                   tree cond_expr = gimple_assign_rhs1 (stmt);
1224                   enum tree_code cond_code = TREE_CODE (cond_expr);
1225
1226                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1227                     {
1228                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1229                                                                   0));
1230                       cond_code = invert_tree_comparison (cond_code,
1231                                                           honor_nans);
1232                       if (cond_code != ERROR_MARK)
1233                         {
1234                           TREE_SET_CODE (cond_expr, cond_code);
1235                           swap_ssa_operands (stmt,
1236                                              gimple_assign_rhs2_ptr (stmt),
1237                                              gimple_assign_rhs3_ptr (stmt));
1238                         }
1239                     }
1240                 }
1241             }
1242
1243           /* Free stmt_vec_info.  */
1244           free_stmt_vec_info (stmt);
1245           gsi_next (&si);
1246         }
1247     }
1248
1249   free (bbs);
1250
1251   release_vec_loop_masks (&masks);
1252
1253   loop->aux = NULL;
1254 }
1255
1256 /* Return true if we can use CMP_TYPE as the comparison type to produce
1257    all masks required to mask LOOP_VINFO.  */
1258
1259 static bool
1260 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1261 {
1262   rgroup_masks *rgm;
1263   unsigned int i;
1264   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1265     if (rgm->mask_type != NULL_TREE
1266         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1267                                             cmp_type, rgm->mask_type,
1268                                             OPTIMIZE_FOR_SPEED))
1269       return false;
1270   return true;
1271 }
1272
1273 /* Calculate the maximum number of scalars per iteration for every
1274    rgroup in LOOP_VINFO.  */
1275
1276 static unsigned int
1277 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1278 {
1279   unsigned int res = 1;
1280   unsigned int i;
1281   rgroup_masks *rgm;
1282   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1283     res = MAX (res, rgm->max_nscalars_per_iter);
1284   return res;
1285 }
1286
1287 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1288    whether we can actually generate the masks required.  Return true if so,
1289    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1290
1291 static bool
1292 vect_verify_full_masking (loop_vec_info loop_vinfo)
1293 {
1294   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1295   unsigned int min_ni_width;
1296
1297   /* Use a normal loop if there are no statements that need masking.
1298      This only happens in rare degenerate cases: it means that the loop
1299      has no loads, no stores, and no live-out values.  */
1300   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1301     return false;
1302
1303   /* Get the maximum number of iterations that is representable
1304      in the counter type.  */
1305   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1306   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1307
1308   /* Get a more refined estimate for the number of iterations.  */
1309   widest_int max_back_edges;
1310   if (max_loop_iterations (loop, &max_back_edges))
1311     max_ni = wi::smin (max_ni, max_back_edges + 1);
1312
1313   /* Account for rgroup masks, in which each bit is replicated N times.  */
1314   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1315
1316   /* Work out how many bits we need to represent the limit.  */
1317   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1318
1319   /* Find a scalar mode for which WHILE_ULT is supported.  */
1320   opt_scalar_int_mode cmp_mode_iter;
1321   tree cmp_type = NULL_TREE;
1322   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1323     {
1324       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1325       if (cmp_bits >= min_ni_width
1326           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1327         {
1328           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1329           if (this_type
1330               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1331             {
1332               /* Although we could stop as soon as we find a valid mode,
1333                  it's often better to continue until we hit Pmode, since the
1334                  operands to the WHILE are more likely to be reusable in
1335                  address calculations.  */
1336               cmp_type = this_type;
1337               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1338                 break;
1339             }
1340         }
1341     }
1342
1343   if (!cmp_type)
1344     return false;
1345
1346   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1347   return true;
1348 }
1349
1350 /* Calculate the cost of one scalar iteration of the loop.  */
1351 static void
1352 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1353 {
1354   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1355   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1356   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1357   int innerloop_iters, i;
1358
1359   /* Count statements in scalar loop.  Using this as scalar cost for a single
1360      iteration for now.
1361
1362      TODO: Add outer loop support.
1363
1364      TODO: Consider assigning different costs to different scalar
1365      statements.  */
1366
1367   /* FORNOW.  */
1368   innerloop_iters = 1;
1369   if (loop->inner)
1370     innerloop_iters = 50; /* FIXME */
1371
1372   for (i = 0; i < nbbs; i++)
1373     {
1374       gimple_stmt_iterator si;
1375       basic_block bb = bbs[i];
1376
1377       if (bb->loop_father == loop->inner)
1378         factor = innerloop_iters;
1379       else
1380         factor = 1;
1381
1382       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1383         {
1384           gimple *stmt = gsi_stmt (si);
1385           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1386
1387           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1388             continue;
1389
1390           /* Skip stmts that are not vectorized inside the loop.  */
1391           if (stmt_info
1392               && !STMT_VINFO_RELEVANT_P (stmt_info)
1393               && (!STMT_VINFO_LIVE_P (stmt_info)
1394                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1395               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1396             continue;
1397
1398           vect_cost_for_stmt kind;
1399           if (STMT_VINFO_DATA_REF (stmt_info))
1400             {
1401               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1402                kind = scalar_load;
1403              else
1404                kind = scalar_store;
1405             }
1406           else
1407             kind = scalar_stmt;
1408
1409           scalar_single_iter_cost
1410             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1411                                  factor, kind, stmt_info, 0, vect_prologue);
1412         }
1413     }
1414   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1415     = scalar_single_iter_cost;
1416 }
1417
1418
1419 /* Function vect_analyze_loop_form_1.
1420
1421    Verify that certain CFG restrictions hold, including:
1422    - the loop has a pre-header
1423    - the loop has a single entry and exit
1424    - the loop exit condition is simple enough
1425    - the number of iterations can be analyzed, i.e, a countable loop.  The
1426      niter could be analyzed under some assumptions.  */
1427
1428 bool
1429 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1430                           tree *assumptions, tree *number_of_iterationsm1,
1431                           tree *number_of_iterations, gcond **inner_loop_cond)
1432 {
1433   if (dump_enabled_p ())
1434     dump_printf_loc (MSG_NOTE, vect_location,
1435                      "=== vect_analyze_loop_form ===\n");
1436
1437   /* Different restrictions apply when we are considering an inner-most loop,
1438      vs. an outer (nested) loop.
1439      (FORNOW. May want to relax some of these restrictions in the future).  */
1440
1441   if (!loop->inner)
1442     {
1443       /* Inner-most loop.  We currently require that the number of BBs is
1444          exactly 2 (the header and latch).  Vectorizable inner-most loops
1445          look like this:
1446
1447                         (pre-header)
1448                            |
1449                           header <--------+
1450                            | |            |
1451                            | +--> latch --+
1452                            |
1453                         (exit-bb)  */
1454
1455       if (loop->num_nodes != 2)
1456         {
1457           if (dump_enabled_p ())
1458             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459                              "not vectorized: control flow in loop.\n");
1460           return false;
1461         }
1462
1463       if (empty_block_p (loop->header))
1464         {
1465           if (dump_enabled_p ())
1466             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1467                              "not vectorized: empty loop.\n");
1468           return false;
1469         }
1470     }
1471   else
1472     {
1473       struct loop *innerloop = loop->inner;
1474       edge entryedge;
1475
1476       /* Nested loop. We currently require that the loop is doubly-nested,
1477          contains a single inner loop, and the number of BBs is exactly 5.
1478          Vectorizable outer-loops look like this:
1479
1480                         (pre-header)
1481                            |
1482                           header <---+
1483                            |         |
1484                           inner-loop |
1485                            |         |
1486                           tail ------+
1487                            |
1488                         (exit-bb)
1489
1490          The inner-loop has the properties expected of inner-most loops
1491          as described above.  */
1492
1493       if ((loop->inner)->inner || (loop->inner)->next)
1494         {
1495           if (dump_enabled_p ())
1496             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1497                              "not vectorized: multiple nested loops.\n");
1498           return false;
1499         }
1500
1501       if (loop->num_nodes != 5)
1502         {
1503           if (dump_enabled_p ())
1504             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1505                              "not vectorized: control flow in loop.\n");
1506           return false;
1507         }
1508
1509       entryedge = loop_preheader_edge (innerloop);
1510       if (entryedge->src != loop->header
1511           || !single_exit (innerloop)
1512           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1513         {
1514           if (dump_enabled_p ())
1515             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1516                              "not vectorized: unsupported outerloop form.\n");
1517           return false;
1518         }
1519
1520       /* Analyze the inner-loop.  */
1521       tree inner_niterm1, inner_niter, inner_assumptions;
1522       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1523                                       &inner_assumptions, &inner_niterm1,
1524                                       &inner_niter, NULL)
1525           /* Don't support analyzing niter under assumptions for inner
1526              loop.  */
1527           || !integer_onep (inner_assumptions))
1528         {
1529           if (dump_enabled_p ())
1530             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1531                              "not vectorized: Bad inner loop.\n");
1532           return false;
1533         }
1534
1535       if (!expr_invariant_in_loop_p (loop, inner_niter))
1536         {
1537           if (dump_enabled_p ())
1538             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1539                              "not vectorized: inner-loop count not"
1540                              " invariant.\n");
1541           return false;
1542         }
1543
1544       if (dump_enabled_p ())
1545         dump_printf_loc (MSG_NOTE, vect_location,
1546                          "Considering outer-loop vectorization.\n");
1547     }
1548
1549   if (!single_exit (loop)
1550       || EDGE_COUNT (loop->header->preds) != 2)
1551     {
1552       if (dump_enabled_p ())
1553         {
1554           if (!single_exit (loop))
1555             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556                              "not vectorized: multiple exits.\n");
1557           else if (EDGE_COUNT (loop->header->preds) != 2)
1558             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1559                              "not vectorized: too many incoming edges.\n");
1560         }
1561       return false;
1562     }
1563
1564   /* We assume that the loop exit condition is at the end of the loop. i.e,
1565      that the loop is represented as a do-while (with a proper if-guard
1566      before the loop if needed), where the loop header contains all the
1567      executable statements, and the latch is empty.  */
1568   if (!empty_block_p (loop->latch)
1569       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1570     {
1571       if (dump_enabled_p ())
1572         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1573                          "not vectorized: latch block not empty.\n");
1574       return false;
1575     }
1576
1577   /* Make sure the exit is not abnormal.  */
1578   edge e = single_exit (loop);
1579   if (e->flags & EDGE_ABNORMAL)
1580     {
1581       if (dump_enabled_p ())
1582         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1583                          "not vectorized: abnormal loop exit edge.\n");
1584       return false;
1585     }
1586
1587   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1588                                      number_of_iterationsm1);
1589   if (!*loop_cond)
1590     {
1591       if (dump_enabled_p ())
1592         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1593                          "not vectorized: complicated exit condition.\n");
1594       return false;
1595     }
1596
1597   if (integer_zerop (*assumptions)
1598       || !*number_of_iterations
1599       || chrec_contains_undetermined (*number_of_iterations))
1600     {
1601       if (dump_enabled_p ())
1602         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1603                          "not vectorized: number of iterations cannot be "
1604                          "computed.\n");
1605       return false;
1606     }
1607
1608   if (integer_zerop (*number_of_iterations))
1609     {
1610       if (dump_enabled_p ())
1611         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1612                          "not vectorized: number of iterations = 0.\n");
1613       return false;
1614     }
1615
1616   return true;
1617 }
1618
1619 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1620
1621 loop_vec_info
1622 vect_analyze_loop_form (struct loop *loop)
1623 {
1624   tree assumptions, number_of_iterations, number_of_iterationsm1;
1625   gcond *loop_cond, *inner_loop_cond = NULL;
1626
1627   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1628                                   &assumptions, &number_of_iterationsm1,
1629                                   &number_of_iterations, &inner_loop_cond))
1630     return NULL;
1631
1632   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1633   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1634   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1635   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1636   if (!integer_onep (assumptions))
1637     {
1638       /* We consider to vectorize this loop by versioning it under
1639          some assumptions.  In order to do this, we need to clear
1640          existing information computed by scev and niter analyzer.  */
1641       scev_reset_htab ();
1642       free_numbers_of_iterations_estimates (loop);
1643       /* Also set flag for this loop so that following scev and niter
1644          analysis are done under the assumptions.  */
1645       loop_constraint_set (loop, LOOP_C_FINITE);
1646       /* Also record the assumptions for versioning.  */
1647       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1648     }
1649
1650   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1651     {
1652       if (dump_enabled_p ())
1653         {
1654           dump_printf_loc (MSG_NOTE, vect_location,
1655                            "Symbolic number of iterations is ");
1656           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1657           dump_printf (MSG_NOTE, "\n");
1658         }
1659     }
1660
1661   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1662   if (inner_loop_cond)
1663     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1664       = loop_exit_ctrl_vec_info_type;
1665
1666   gcc_assert (!loop->aux);
1667   loop->aux = loop_vinfo;
1668   return loop_vinfo;
1669 }
1670
1671
1672
1673 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1674    statements update the vectorization factor.  */
1675
1676 static void
1677 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1678 {
1679   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1680   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1681   int nbbs = loop->num_nodes;
1682   poly_uint64 vectorization_factor;
1683   int i;
1684
1685   if (dump_enabled_p ())
1686     dump_printf_loc (MSG_NOTE, vect_location,
1687                      "=== vect_update_vf_for_slp ===\n");
1688
1689   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1690   gcc_assert (known_ne (vectorization_factor, 0U));
1691
1692   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1693      vectorization factor of the loop is the unrolling factor required by
1694      the SLP instances.  If that unrolling factor is 1, we say, that we
1695      perform pure SLP on loop - cross iteration parallelism is not
1696      exploited.  */
1697   bool only_slp_in_loop = true;
1698   for (i = 0; i < nbbs; i++)
1699     {
1700       basic_block bb = bbs[i];
1701       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1702            gsi_next (&si))
1703         {
1704           gimple *stmt = gsi_stmt (si);
1705           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1706           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1707               && STMT_VINFO_RELATED_STMT (stmt_info))
1708             {
1709               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1710               stmt_info = vinfo_for_stmt (stmt);
1711             }
1712           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1713                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1714               && !PURE_SLP_STMT (stmt_info))
1715             /* STMT needs both SLP and loop-based vectorization.  */
1716             only_slp_in_loop = false;
1717         }
1718     }
1719
1720   if (only_slp_in_loop)
1721     {
1722       dump_printf_loc (MSG_NOTE, vect_location,
1723                        "Loop contains only SLP stmts\n");
1724       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1725     }
1726   else
1727     {
1728       dump_printf_loc (MSG_NOTE, vect_location,
1729                        "Loop contains SLP and non-SLP stmts\n");
1730       /* Both the vectorization factor and unroll factor have the form
1731          current_vector_size * X for some rational X, so they must have
1732          a common multiple.  */
1733       vectorization_factor
1734         = force_common_multiple (vectorization_factor,
1735                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1736     }
1737
1738   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1739   if (dump_enabled_p ())
1740     {
1741       dump_printf_loc (MSG_NOTE, vect_location,
1742                        "Updating vectorization factor to ");
1743       dump_dec (MSG_NOTE, vectorization_factor);
1744       dump_printf (MSG_NOTE, ".\n");
1745     }
1746 }
1747
1748 /* Return true if STMT_INFO describes a double reduction phi and if
1749    the other phi in the reduction is also relevant for vectorization.
1750    This rejects cases such as:
1751
1752       outer1:
1753         x_1 = PHI <x_3(outer2), ...>;
1754         ...
1755
1756       inner:
1757         x_2 = ...;
1758         ...
1759
1760       outer2:
1761         x_3 = PHI <x_2(inner)>;
1762
1763    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1764
1765 static bool
1766 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1767 {
1768   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1769     return false;
1770
1771   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1772   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1773 }
1774
1775 /* Function vect_analyze_loop_operations.
1776
1777    Scan the loop stmts and make sure they are all vectorizable.  */
1778
1779 static bool
1780 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1781 {
1782   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1783   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1784   int nbbs = loop->num_nodes;
1785   int i;
1786   stmt_vec_info stmt_info;
1787   bool need_to_vectorize = false;
1788   bool ok;
1789
1790   if (dump_enabled_p ())
1791     dump_printf_loc (MSG_NOTE, vect_location,
1792                      "=== vect_analyze_loop_operations ===\n");
1793
1794   for (i = 0; i < nbbs; i++)
1795     {
1796       basic_block bb = bbs[i];
1797
1798       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1799            gsi_next (&si))
1800         {
1801           gphi *phi = si.phi ();
1802           ok = true;
1803
1804           stmt_info = vinfo_for_stmt (phi);
1805           if (dump_enabled_p ())
1806             {
1807               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1808               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1809             }
1810           if (virtual_operand_p (gimple_phi_result (phi)))
1811             continue;
1812
1813           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1814              (i.e., a phi in the tail of the outer-loop).  */
1815           if (! is_loop_header_bb_p (bb))
1816             {
1817               /* FORNOW: we currently don't support the case that these phis
1818                  are not used in the outerloop (unless it is double reduction,
1819                  i.e., this phi is vect_reduction_def), cause this case
1820                  requires to actually do something here.  */
1821               if (STMT_VINFO_LIVE_P (stmt_info)
1822                   && !vect_active_double_reduction_p (stmt_info))
1823                 {
1824                   if (dump_enabled_p ())
1825                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826                                      "Unsupported loop-closed phi in "
1827                                      "outer-loop.\n");
1828                   return false;
1829                 }
1830
1831               /* If PHI is used in the outer loop, we check that its operand
1832                  is defined in the inner loop.  */
1833               if (STMT_VINFO_RELEVANT_P (stmt_info))
1834                 {
1835                   tree phi_op;
1836                   gimple *op_def_stmt;
1837
1838                   if (gimple_phi_num_args (phi) != 1)
1839                     return false;
1840
1841                   phi_op = PHI_ARG_DEF (phi, 0);
1842                   if (TREE_CODE (phi_op) != SSA_NAME)
1843                     return false;
1844
1845                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1846                   if (gimple_nop_p (op_def_stmt)
1847                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1848                       || !vinfo_for_stmt (op_def_stmt))
1849                     return false;
1850
1851                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1852                         != vect_used_in_outer
1853                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1854                            != vect_used_in_outer_by_reduction)
1855                     return false;
1856                 }
1857
1858               continue;
1859             }
1860
1861           gcc_assert (stmt_info);
1862
1863           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1864                || STMT_VINFO_LIVE_P (stmt_info))
1865               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1866             {
1867               /* A scalar-dependence cycle that we don't support.  */
1868               if (dump_enabled_p ())
1869                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1870                                  "not vectorized: scalar dependence cycle.\n");
1871               return false;
1872             }
1873
1874           if (STMT_VINFO_RELEVANT_P (stmt_info))
1875             {
1876               need_to_vectorize = true;
1877               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1878                   && ! PURE_SLP_STMT (stmt_info))
1879                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1880               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1881                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1882                        && ! PURE_SLP_STMT (stmt_info))
1883                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1884             }
1885
1886           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1887           if (ok
1888               && STMT_VINFO_LIVE_P (stmt_info)
1889               && !PURE_SLP_STMT (stmt_info))
1890             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1891
1892           if (!ok)
1893             {
1894               if (dump_enabled_p ())
1895                 {
1896                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1897                                    "not vectorized: relevant phi not "
1898                                    "supported: ");
1899                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1900                 }
1901               return false;
1902             }
1903         }
1904
1905       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1906            gsi_next (&si))
1907         {
1908           gimple *stmt = gsi_stmt (si);
1909           if (!gimple_clobber_p (stmt)
1910               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1911             return false;
1912         }
1913     } /* bbs */
1914
1915   /* All operations in the loop are either irrelevant (deal with loop
1916      control, or dead), or only used outside the loop and can be moved
1917      out of the loop (e.g. invariants, inductions).  The loop can be
1918      optimized away by scalar optimizations.  We're better off not
1919      touching this loop.  */
1920   if (!need_to_vectorize)
1921     {
1922       if (dump_enabled_p ())
1923         dump_printf_loc (MSG_NOTE, vect_location,
1924                          "All the computation can be taken out of the loop.\n");
1925       if (dump_enabled_p ())
1926         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1927                          "not vectorized: redundant loop. no profit to "
1928                          "vectorize.\n");
1929       return false;
1930     }
1931
1932   return true;
1933 }
1934
1935 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1936    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1937    definitely no, or -1 if it's worth retrying.  */
1938
1939 static int
1940 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1941 {
1942   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1943   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1944
1945   /* Only fully-masked loops can have iteration counts less than the
1946      vectorization factor.  */
1947   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1948     {
1949       HOST_WIDE_INT max_niter;
1950
1951       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1952         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1953       else
1954         max_niter = max_stmt_executions_int (loop);
1955
1956       if (max_niter != -1
1957           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1958         {
1959           if (dump_enabled_p ())
1960             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1961                              "not vectorized: iteration count smaller than "
1962                              "vectorization factor.\n");
1963           return 0;
1964         }
1965     }
1966
1967   int min_profitable_iters, min_profitable_estimate;
1968   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1969                                       &min_profitable_estimate);
1970
1971   if (min_profitable_iters < 0)
1972     {
1973       if (dump_enabled_p ())
1974         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1975                          "not vectorized: vectorization not profitable.\n");
1976       if (dump_enabled_p ())
1977         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1978                          "not vectorized: vector version will never be "
1979                          "profitable.\n");
1980       return -1;
1981     }
1982
1983   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1984                                * assumed_vf);
1985
1986   /* Use the cost model only if it is more conservative than user specified
1987      threshold.  */
1988   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1989                                     min_profitable_iters);
1990
1991   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1992
1993   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1994       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1995     {
1996       if (dump_enabled_p ())
1997         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1998                          "not vectorized: vectorization not profitable.\n");
1999       if (dump_enabled_p ())
2000         dump_printf_loc (MSG_NOTE, vect_location,
2001                          "not vectorized: iteration count smaller than user "
2002                          "specified loop bound parameter or minimum profitable "
2003                          "iterations (whichever is more conservative).\n");
2004       return 0;
2005     }
2006
2007   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2008   if (estimated_niter == -1)
2009     estimated_niter = likely_max_stmt_executions_int (loop);
2010   if (estimated_niter != -1
2011       && ((unsigned HOST_WIDE_INT) estimated_niter
2012           < MAX (th, (unsigned) min_profitable_estimate)))
2013     {
2014       if (dump_enabled_p ())
2015         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2016                          "not vectorized: estimated iteration count too "
2017                          "small.\n");
2018       if (dump_enabled_p ())
2019         dump_printf_loc (MSG_NOTE, vect_location,
2020                          "not vectorized: estimated iteration count smaller "
2021                          "than specified loop bound parameter or minimum "
2022                          "profitable iterations (whichever is more "
2023                          "conservative).\n");
2024       return -1;
2025     }
2026
2027   return 1;
2028 }
2029
2030
2031 /* Function vect_analyze_loop_2.
2032
2033    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2034    for it.  The different analyses will record information in the
2035    loop_vec_info struct.  */
2036 static bool
2037 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2038 {
2039   bool ok;
2040   int res;
2041   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2042   poly_uint64 min_vf = 2;
2043   unsigned int n_stmts = 0;
2044
2045   /* The first group of checks is independent of the vector size.  */
2046   fatal = true;
2047
2048   /* Find all data references in the loop (which correspond to vdefs/vuses)
2049      and analyze their evolution in the loop.  */
2050
2051   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2052
2053   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2054   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2055     {
2056       if (dump_enabled_p ())
2057         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2058                          "not vectorized: loop nest containing two "
2059                          "or more consecutive inner loops cannot be "
2060                          "vectorized\n");
2061       return false;
2062     }
2063
2064   for (unsigned i = 0; i < loop->num_nodes; i++)
2065     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2066          !gsi_end_p (gsi); gsi_next (&gsi))
2067       {
2068         gimple *stmt = gsi_stmt (gsi);
2069         if (is_gimple_debug (stmt))
2070           continue;
2071         ++n_stmts;
2072         if (!find_data_references_in_stmt (loop, stmt,
2073                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
2074           {
2075             if (is_gimple_call (stmt) && loop->safelen)
2076               {
2077                 tree fndecl = gimple_call_fndecl (stmt), op;
2078                 if (fndecl != NULL_TREE)
2079                   {
2080                     cgraph_node *node = cgraph_node::get (fndecl);
2081                     if (node != NULL && node->simd_clones != NULL)
2082                       {
2083                         unsigned int j, n = gimple_call_num_args (stmt);
2084                         for (j = 0; j < n; j++)
2085                           {
2086                             op = gimple_call_arg (stmt, j);
2087                             if (DECL_P (op)
2088                                 || (REFERENCE_CLASS_P (op)
2089                                     && get_base_address (op)))
2090                               break;
2091                           }
2092                         op = gimple_call_lhs (stmt);
2093                         /* Ignore #pragma omp declare simd functions
2094                            if they don't have data references in the
2095                            call stmt itself.  */
2096                         if (j == n
2097                             && !(op
2098                                  && (DECL_P (op)
2099                                      || (REFERENCE_CLASS_P (op)
2100                                          && get_base_address (op)))))
2101                           continue;
2102                       }
2103                   }
2104               }
2105             if (dump_enabled_p ())
2106               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2107                                "not vectorized: loop contains function "
2108                                "calls or data references that cannot "
2109                                "be analyzed\n");
2110             return false;
2111           }
2112       }
2113
2114   /* Analyze the data references and also adjust the minimal
2115      vectorization factor according to the loads and stores.  */
2116
2117   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2118   if (!ok)
2119     {
2120       if (dump_enabled_p ())
2121         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2122                          "bad data references.\n");
2123       return false;
2124     }
2125
2126   /* Classify all cross-iteration scalar data-flow cycles.
2127      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2128   vect_analyze_scalar_cycles (loop_vinfo);
2129
2130   vect_pattern_recog (loop_vinfo);
2131
2132   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2133
2134   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2135      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2136
2137   ok = vect_analyze_data_ref_accesses (loop_vinfo);
2138   if (!ok)
2139     {
2140       if (dump_enabled_p ())
2141         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2142                          "bad data access.\n");
2143       return false;
2144     }
2145
2146   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2147
2148   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2149   if (!ok)
2150     {
2151       if (dump_enabled_p ())
2152         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2153                          "unexpected pattern.\n");
2154       return false;
2155     }
2156
2157   /* While the rest of the analysis below depends on it in some way.  */
2158   fatal = false;
2159
2160   /* Analyze data dependences between the data-refs in the loop
2161      and adjust the maximum vectorization factor according to
2162      the dependences.
2163      FORNOW: fail at the first data dependence that we encounter.  */
2164
2165   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2166   if (!ok
2167       || (max_vf != MAX_VECTORIZATION_FACTOR
2168           && maybe_lt (max_vf, min_vf)))
2169     {
2170       if (dump_enabled_p ())
2171             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2172                              "bad data dependence.\n");
2173       return false;
2174     }
2175   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2176
2177   ok = vect_determine_vectorization_factor (loop_vinfo);
2178   if (!ok)
2179     {
2180       if (dump_enabled_p ())
2181         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182                          "can't determine vectorization factor.\n");
2183       return false;
2184     }
2185   if (max_vf != MAX_VECTORIZATION_FACTOR
2186       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2187     {
2188       if (dump_enabled_p ())
2189         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2190                          "bad data dependence.\n");
2191       return false;
2192     }
2193
2194   /* Compute the scalar iteration cost.  */
2195   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2196
2197   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2198   unsigned th;
2199
2200   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2201   ok = vect_analyze_slp (loop_vinfo, n_stmts);
2202   if (!ok)
2203     return false;
2204
2205   /* If there are any SLP instances mark them as pure_slp.  */
2206   bool slp = vect_make_slp_decision (loop_vinfo);
2207   if (slp)
2208     {
2209       /* Find stmts that need to be both vectorized and SLPed.  */
2210       vect_detect_hybrid_slp (loop_vinfo);
2211
2212       /* Update the vectorization factor based on the SLP decision.  */
2213       vect_update_vf_for_slp (loop_vinfo);
2214     }
2215
2216   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2217
2218   /* We don't expect to have to roll back to anything other than an empty
2219      set of rgroups.  */
2220   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2221
2222   /* This is the point where we can re-start analysis with SLP forced off.  */
2223 start_over:
2224
2225   /* Now the vectorization factor is final.  */
2226   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2227   gcc_assert (known_ne (vectorization_factor, 0U));
2228
2229   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2230     {
2231       dump_printf_loc (MSG_NOTE, vect_location,
2232                        "vectorization_factor = ");
2233       dump_dec (MSG_NOTE, vectorization_factor);
2234       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2235                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2236     }
2237
2238   HOST_WIDE_INT max_niter
2239     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2240
2241   /* Analyze the alignment of the data-refs in the loop.
2242      Fail if a data reference is found that cannot be vectorized.  */
2243
2244   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2245   if (!ok)
2246     {
2247       if (dump_enabled_p ())
2248         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249                          "bad data alignment.\n");
2250       return false;
2251     }
2252
2253   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2254      It is important to call pruning after vect_analyze_data_ref_accesses,
2255      since we use grouping information gathered by interleaving analysis.  */
2256   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2257   if (!ok)
2258     return false;
2259
2260   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2261      vectorization.  */
2262   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2263     {
2264     /* This pass will decide on using loop versioning and/or loop peeling in
2265        order to enhance the alignment of data references in the loop.  */
2266     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2267     if (!ok)
2268       {
2269         if (dump_enabled_p ())
2270           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271                            "bad data alignment.\n");
2272         return false;
2273       }
2274     }
2275
2276   if (slp)
2277     {
2278       /* Analyze operations in the SLP instances.  Note this may
2279          remove unsupported SLP instances which makes the above
2280          SLP kind detection invalid.  */
2281       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2282       vect_slp_analyze_operations (loop_vinfo);
2283       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2284         goto again;
2285     }
2286
2287   /* Scan all the remaining operations in the loop that are not subject
2288      to SLP and make sure they are vectorizable.  */
2289   ok = vect_analyze_loop_operations (loop_vinfo);
2290   if (!ok)
2291     {
2292       if (dump_enabled_p ())
2293         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2294                          "bad operation or unsupported loop bound.\n");
2295       return false;
2296     }
2297
2298   /* Decide whether to use a fully-masked loop for this vectorization
2299      factor.  */
2300   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2301     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2302        && vect_verify_full_masking (loop_vinfo));
2303   if (dump_enabled_p ())
2304     {
2305       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2306         dump_printf_loc (MSG_NOTE, vect_location,
2307                          "using a fully-masked loop.\n");
2308       else
2309         dump_printf_loc (MSG_NOTE, vect_location,
2310                          "not using a fully-masked loop.\n");
2311     }
2312
2313   /* If epilog loop is required because of data accesses with gaps,
2314      one additional iteration needs to be peeled.  Check if there is
2315      enough iterations for vectorization.  */
2316   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2317       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2318       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2319     {
2320       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2321       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2322
2323       if (known_lt (wi::to_widest (scalar_niters), vf))
2324         {
2325           if (dump_enabled_p ())
2326             dump_printf_loc (MSG_NOTE, vect_location,
2327                              "loop has no enough iterations to support"
2328                              " peeling for gaps.\n");
2329           return false;
2330         }
2331     }
2332
2333   /* Check the costings of the loop make vectorizing worthwhile.  */
2334   res = vect_analyze_loop_costing (loop_vinfo);
2335   if (res < 0)
2336     goto again;
2337   if (!res)
2338     {
2339       if (dump_enabled_p ())
2340         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2341                          "Loop costings not worthwhile.\n");
2342       return false;
2343     }
2344
2345   /* Decide whether we need to create an epilogue loop to handle
2346      remaining scalar iterations.  */
2347   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2348
2349   unsigned HOST_WIDE_INT const_vf;
2350   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2351     /* The main loop handles all iterations.  */
2352     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2353   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2354            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2355     {
2356       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2357                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2358                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2359         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2360     }
2361   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2362            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2363            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2364                 < (unsigned) exact_log2 (const_vf))
2365                /* In case of versioning, check if the maximum number of
2366                   iterations is greater than th.  If they are identical,
2367                   the epilogue is unnecessary.  */
2368                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2369                    || ((unsigned HOST_WIDE_INT) max_niter
2370                        > (th / const_vf) * const_vf))))
2371     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2372
2373   /* If an epilogue loop is required make sure we can create one.  */
2374   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2375       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2376     {
2377       if (dump_enabled_p ())
2378         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2379       if (!vect_can_advance_ivs_p (loop_vinfo)
2380           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2381                                            single_exit (LOOP_VINFO_LOOP
2382                                                          (loop_vinfo))))
2383         {
2384           if (dump_enabled_p ())
2385             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2386                              "not vectorized: can't create required "
2387                              "epilog loop\n");
2388           goto again;
2389         }
2390     }
2391
2392   /* During peeling, we need to check if number of loop iterations is
2393      enough for both peeled prolog loop and vector loop.  This check
2394      can be merged along with threshold check of loop versioning, so
2395      increase threshold for this case if necessary.  */
2396   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2397     {
2398       poly_uint64 niters_th = 0;
2399
2400       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2401         {
2402           /* Niters for peeled prolog loop.  */
2403           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2404             {
2405               struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2406               tree vectype
2407                 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2408               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2409             }
2410           else
2411             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2412         }
2413
2414       /* Niters for at least one iteration of vectorized loop.  */
2415       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2416         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2417       /* One additional iteration because of peeling for gap.  */
2418       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2419         niters_th += 1;
2420       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2421     }
2422
2423   gcc_assert (known_eq (vectorization_factor,
2424                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2425
2426   /* Ok to vectorize!  */
2427   return true;
2428
2429 again:
2430   /* Try again with SLP forced off but if we didn't do any SLP there is
2431      no point in re-trying.  */
2432   if (!slp)
2433     return false;
2434
2435   /* If there are reduction chains re-trying will fail anyway.  */
2436   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2437     return false;
2438
2439   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2440      via interleaving or lane instructions.  */
2441   slp_instance instance;
2442   slp_tree node;
2443   unsigned i, j;
2444   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2445     {
2446       stmt_vec_info vinfo;
2447       vinfo = vinfo_for_stmt
2448           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2449       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2450         continue;
2451       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2452       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2453       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2454       if (! vect_store_lanes_supported (vectype, size, false)
2455           && ! vect_grouped_store_supported (vectype, size))
2456         return false;
2457       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2458         {
2459           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2460           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2461           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2462           size = STMT_VINFO_GROUP_SIZE (vinfo);
2463           vectype = STMT_VINFO_VECTYPE (vinfo);
2464           if (! vect_load_lanes_supported (vectype, size, false)
2465               && ! vect_grouped_load_supported (vectype, single_element_p,
2466                                                 size))
2467             return false;
2468         }
2469     }
2470
2471   if (dump_enabled_p ())
2472     dump_printf_loc (MSG_NOTE, vect_location,
2473                      "re-trying with SLP disabled\n");
2474
2475   /* Roll back state appropriately.  No SLP this time.  */
2476   slp = false;
2477   /* Restore vectorization factor as it were without SLP.  */
2478   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2479   /* Free the SLP instances.  */
2480   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2481     vect_free_slp_instance (instance);
2482   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2483   /* Reset SLP type to loop_vect on all stmts.  */
2484   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2485     {
2486       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2487       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2488            !gsi_end_p (si); gsi_next (&si))
2489         {
2490           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2491           STMT_SLP_TYPE (stmt_info) = loop_vect;
2492         }
2493       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2494            !gsi_end_p (si); gsi_next (&si))
2495         {
2496           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2497           STMT_SLP_TYPE (stmt_info) = loop_vect;
2498           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2499             {
2500               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2501               STMT_SLP_TYPE (stmt_info) = loop_vect;
2502               for (gimple_stmt_iterator pi
2503                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2504                    !gsi_end_p (pi); gsi_next (&pi))
2505                 {
2506                   gimple *pstmt = gsi_stmt (pi);
2507                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2508                 }
2509             }
2510         }
2511     }
2512   /* Free optimized alias test DDRS.  */
2513   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2514   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2515   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2516   /* Reset target cost data.  */
2517   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2518   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2519     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2520   /* Reset accumulated rgroup information.  */
2521   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2522   /* Reset assorted flags.  */
2523   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2524   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2525   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2526   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2527   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2528
2529   goto start_over;
2530 }
2531
2532 /* Function vect_analyze_loop.
2533
2534    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2535    for it.  The different analyses will record information in the
2536    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2537    be vectorized.  */
2538 loop_vec_info
2539 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2540 {
2541   loop_vec_info loop_vinfo;
2542   auto_vector_sizes vector_sizes;
2543
2544   /* Autodetect first vector size we try.  */
2545   current_vector_size = 0;
2546   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2547   unsigned int next_size = 0;
2548
2549   if (dump_enabled_p ())
2550     dump_printf_loc (MSG_NOTE, vect_location,
2551                      "===== analyze_loop_nest =====\n");
2552
2553   if (loop_outer (loop)
2554       && loop_vec_info_for_loop (loop_outer (loop))
2555       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2556     {
2557       if (dump_enabled_p ())
2558         dump_printf_loc (MSG_NOTE, vect_location,
2559                          "outer-loop already vectorized.\n");
2560       return NULL;
2561     }
2562
2563   poly_uint64 autodetected_vector_size = 0;
2564   while (1)
2565     {
2566       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2567       loop_vinfo = vect_analyze_loop_form (loop);
2568       if (!loop_vinfo)
2569         {
2570           if (dump_enabled_p ())
2571             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2572                              "bad loop form.\n");
2573           return NULL;
2574         }
2575
2576       bool fatal = false;
2577
2578       if (orig_loop_vinfo)
2579         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2580
2581       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2582         {
2583           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2584
2585           return loop_vinfo;
2586         }
2587
2588       delete loop_vinfo;
2589
2590       if (next_size == 0)
2591         autodetected_vector_size = current_vector_size;
2592
2593       if (next_size < vector_sizes.length ()
2594           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2595         next_size += 1;
2596
2597       if (fatal
2598           || next_size == vector_sizes.length ()
2599           || known_eq (current_vector_size, 0U))
2600         return NULL;
2601
2602       /* Try the next biggest vector size.  */
2603       current_vector_size = vector_sizes[next_size++];
2604       if (dump_enabled_p ())
2605         {
2606           dump_printf_loc (MSG_NOTE, vect_location,
2607                            "***** Re-trying analysis with "
2608                            "vector size ");
2609           dump_dec (MSG_NOTE, current_vector_size);
2610           dump_printf (MSG_NOTE, "\n");
2611         }
2612     }
2613 }
2614
2615 /* Return true if there is an in-order reduction function for CODE, storing
2616    it in *REDUC_FN if so.  */
2617
2618 static bool
2619 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2620 {
2621   switch (code)
2622     {
2623     case PLUS_EXPR:
2624       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2625       return true;
2626
2627     default:
2628       return false;
2629     }
2630 }
2631
2632 /* Function reduction_fn_for_scalar_code
2633
2634    Input:
2635    CODE - tree_code of a reduction operations.
2636
2637    Output:
2638    REDUC_FN - the corresponding internal function to be used to reduce the
2639       vector of partial results into a single scalar result, or IFN_LAST
2640       if the operation is a supported reduction operation, but does not have
2641       such an internal function.
2642
2643    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2644
2645 static bool
2646 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2647 {
2648   switch (code)
2649     {
2650       case MAX_EXPR:
2651         *reduc_fn = IFN_REDUC_MAX;
2652         return true;
2653
2654       case MIN_EXPR:
2655         *reduc_fn = IFN_REDUC_MIN;
2656         return true;
2657
2658       case PLUS_EXPR:
2659         *reduc_fn = IFN_REDUC_PLUS;
2660         return true;
2661
2662       case BIT_AND_EXPR:
2663         *reduc_fn = IFN_REDUC_AND;
2664         return true;
2665
2666       case BIT_IOR_EXPR:
2667         *reduc_fn = IFN_REDUC_IOR;
2668         return true;
2669
2670       case BIT_XOR_EXPR:
2671         *reduc_fn = IFN_REDUC_XOR;
2672         return true;
2673
2674       case MULT_EXPR:
2675       case MINUS_EXPR:
2676         *reduc_fn = IFN_LAST;
2677         return true;
2678
2679       default:
2680        return false;
2681     }
2682 }
2683
2684 /* If there is a neutral value X such that SLP reduction NODE would not
2685    be affected by the introduction of additional X elements, return that X,
2686    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2687    is true if the SLP statements perform a single reduction, false if each
2688    statement performs an independent reduction.  */
2689
2690 static tree
2691 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2692                               bool reduc_chain)
2693 {
2694   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2695   gimple *stmt = stmts[0];
2696   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2697   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2698   tree scalar_type = TREE_TYPE (vector_type);
2699   struct loop *loop = gimple_bb (stmt)->loop_father;
2700   gcc_assert (loop);
2701
2702   switch (code)
2703     {
2704     case WIDEN_SUM_EXPR:
2705     case DOT_PROD_EXPR:
2706     case SAD_EXPR:
2707     case PLUS_EXPR:
2708     case MINUS_EXPR:
2709     case BIT_IOR_EXPR:
2710     case BIT_XOR_EXPR:
2711       return build_zero_cst (scalar_type);
2712
2713     case MULT_EXPR:
2714       return build_one_cst (scalar_type);
2715
2716     case BIT_AND_EXPR:
2717       return build_all_ones_cst (scalar_type);
2718
2719     case MAX_EXPR:
2720     case MIN_EXPR:
2721       /* For MIN/MAX the initial values are neutral.  A reduction chain
2722          has only a single initial value, so that value is neutral for
2723          all statements.  */
2724       if (reduc_chain)
2725         return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2726       return NULL_TREE;
2727
2728     default:
2729       return NULL_TREE;
2730     }
2731 }
2732
2733 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2734    STMT is printed with a message MSG. */
2735
2736 static void
2737 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2738 {
2739   dump_printf_loc (msg_type, vect_location, "%s", msg);
2740   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2741 }
2742
2743
2744 /* Detect SLP reduction of the form:
2745
2746    #a1 = phi <a5, a0>
2747    a2 = operation (a1)
2748    a3 = operation (a2)
2749    a4 = operation (a3)
2750    a5 = operation (a4)
2751
2752    #a = phi <a5>
2753
2754    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2755    FIRST_STMT is the first reduction stmt in the chain
2756    (a2 = operation (a1)).
2757
2758    Return TRUE if a reduction chain was detected.  */
2759
2760 static bool
2761 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2762                        gimple *first_stmt)
2763 {
2764   struct loop *loop = (gimple_bb (phi))->loop_father;
2765   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2766   enum tree_code code;
2767   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2768   stmt_vec_info use_stmt_info, current_stmt_info;
2769   tree lhs;
2770   imm_use_iterator imm_iter;
2771   use_operand_p use_p;
2772   int nloop_uses, size = 0, n_out_of_loop_uses;
2773   bool found = false;
2774
2775   if (loop != vect_loop)
2776     return false;
2777
2778   lhs = PHI_RESULT (phi);
2779   code = gimple_assign_rhs_code (first_stmt);
2780   while (1)
2781     {
2782       nloop_uses = 0;
2783       n_out_of_loop_uses = 0;
2784       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2785         {
2786           gimple *use_stmt = USE_STMT (use_p);
2787           if (is_gimple_debug (use_stmt))
2788             continue;
2789
2790           /* Check if we got back to the reduction phi.  */
2791           if (use_stmt == phi)
2792             {
2793               loop_use_stmt = use_stmt;
2794               found = true;
2795               break;
2796             }
2797
2798           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2799             {
2800               loop_use_stmt = use_stmt;
2801               nloop_uses++;
2802             }
2803            else
2804              n_out_of_loop_uses++;
2805
2806            /* There are can be either a single use in the loop or two uses in
2807               phi nodes.  */
2808            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2809              return false;
2810         }
2811
2812       if (found)
2813         break;
2814
2815       /* We reached a statement with no loop uses.  */
2816       if (nloop_uses == 0)
2817         return false;
2818
2819       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2820       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2821         return false;
2822
2823       if (!is_gimple_assign (loop_use_stmt)
2824           || code != gimple_assign_rhs_code (loop_use_stmt)
2825           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2826         return false;
2827
2828       /* Insert USE_STMT into reduction chain.  */
2829       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2830       if (current_stmt)
2831         {
2832           current_stmt_info = vinfo_for_stmt (current_stmt);
2833           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2834           GROUP_FIRST_ELEMENT (use_stmt_info)
2835             = GROUP_FIRST_ELEMENT (current_stmt_info);
2836         }
2837       else
2838         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2839
2840       lhs = gimple_assign_lhs (loop_use_stmt);
2841       current_stmt = loop_use_stmt;
2842       size++;
2843    }
2844
2845   if (!found || loop_use_stmt != phi || size < 2)
2846     return false;
2847
2848   /* Swap the operands, if needed, to make the reduction operand be the second
2849      operand.  */
2850   lhs = PHI_RESULT (phi);
2851   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2852   while (next_stmt)
2853     {
2854       if (gimple_assign_rhs2 (next_stmt) == lhs)
2855         {
2856           tree op = gimple_assign_rhs1 (next_stmt);
2857           gimple *def_stmt = NULL;
2858
2859           if (TREE_CODE (op) == SSA_NAME)
2860             def_stmt = SSA_NAME_DEF_STMT (op);
2861
2862           /* Check that the other def is either defined in the loop
2863              ("vect_internal_def"), or it's an induction (defined by a
2864              loop-header phi-node).  */
2865           if (def_stmt
2866               && gimple_bb (def_stmt)
2867               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2868               && (is_gimple_assign (def_stmt)
2869                   || is_gimple_call (def_stmt)
2870                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2871                            == vect_induction_def
2872                   || (gimple_code (def_stmt) == GIMPLE_PHI
2873                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2874                                   == vect_internal_def
2875                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2876             {
2877               lhs = gimple_assign_lhs (next_stmt);
2878               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2879               continue;
2880             }
2881
2882           return false;
2883         }
2884       else
2885         {
2886           tree op = gimple_assign_rhs2 (next_stmt);
2887           gimple *def_stmt = NULL;
2888
2889           if (TREE_CODE (op) == SSA_NAME)
2890             def_stmt = SSA_NAME_DEF_STMT (op);
2891
2892           /* Check that the other def is either defined in the loop
2893             ("vect_internal_def"), or it's an induction (defined by a
2894             loop-header phi-node).  */
2895           if (def_stmt
2896               && gimple_bb (def_stmt)
2897               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2898               && (is_gimple_assign (def_stmt)
2899                   || is_gimple_call (def_stmt)
2900                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2901                               == vect_induction_def
2902                   || (gimple_code (def_stmt) == GIMPLE_PHI
2903                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2904                                   == vect_internal_def
2905                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2906             {
2907               if (dump_enabled_p ())
2908                 {
2909                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2910                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2911                 }
2912
2913               swap_ssa_operands (next_stmt,
2914                                  gimple_assign_rhs1_ptr (next_stmt),
2915                                  gimple_assign_rhs2_ptr (next_stmt));
2916               update_stmt (next_stmt);
2917
2918               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2919                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2920             }
2921           else
2922             return false;
2923         }
2924
2925       lhs = gimple_assign_lhs (next_stmt);
2926       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2927     }
2928
2929   /* Save the chain for further analysis in SLP detection.  */
2930   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2931   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2932   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2933
2934   return true;
2935 }
2936
2937 /* Return true if we need an in-order reduction for operation CODE
2938    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2939    overflow must wrap.  */
2940
2941 static bool
2942 needs_fold_left_reduction_p (tree type, tree_code code,
2943                              bool need_wrapping_integral_overflow)
2944 {
2945   /* CHECKME: check for !flag_finite_math_only too?  */
2946   if (SCALAR_FLOAT_TYPE_P (type))
2947     switch (code)
2948       {
2949       case MIN_EXPR:
2950       case MAX_EXPR:
2951         return false;
2952
2953       default:
2954         return !flag_associative_math;
2955       }
2956
2957   if (INTEGRAL_TYPE_P (type))
2958     {
2959       if (!operation_no_trapping_overflow (type, code))
2960         return true;
2961       if (need_wrapping_integral_overflow
2962           && !TYPE_OVERFLOW_WRAPS (type)
2963           && operation_can_overflow (code))
2964         return true;
2965       return false;
2966     }
2967
2968   if (SAT_FIXED_POINT_TYPE_P (type))
2969     return true;
2970
2971   return false;
2972 }
2973
2974 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2975    reduction operation CODE has a handled computation expression.  */
2976
2977 bool
2978 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2979                       enum tree_code code)
2980 {
2981   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2982   auto_bitmap visited;
2983   tree lookfor = PHI_RESULT (phi);
2984   ssa_op_iter curri;
2985   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2986   while (USE_FROM_PTR (curr) != loop_arg)
2987     curr = op_iter_next_use (&curri);
2988   curri.i = curri.numops;
2989   do
2990     {
2991       path.safe_push (std::make_pair (curri, curr));
2992       tree use = USE_FROM_PTR (curr);
2993       if (use == lookfor)
2994         break;
2995       gimple *def = SSA_NAME_DEF_STMT (use);
2996       if (gimple_nop_p (def)
2997           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2998         {
2999 pop:
3000           do
3001             {
3002               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3003               curri = x.first;
3004               curr = x.second;
3005               do
3006                 curr = op_iter_next_use (&curri);
3007               /* Skip already visited or non-SSA operands (from iterating
3008                  over PHI args).  */
3009               while (curr != NULL_USE_OPERAND_P
3010                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3011                          || ! bitmap_set_bit (visited,
3012                                               SSA_NAME_VERSION
3013                                                 (USE_FROM_PTR (curr)))));
3014             }
3015           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3016           if (curr == NULL_USE_OPERAND_P)
3017             break;
3018         }
3019       else
3020         {
3021           if (gimple_code (def) == GIMPLE_PHI)
3022             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3023           else
3024             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3025           while (curr != NULL_USE_OPERAND_P
3026                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3027                      || ! bitmap_set_bit (visited,
3028                                           SSA_NAME_VERSION
3029                                             (USE_FROM_PTR (curr)))))
3030             curr = op_iter_next_use (&curri);
3031           if (curr == NULL_USE_OPERAND_P)
3032             goto pop;
3033         }
3034     }
3035   while (1);
3036   if (dump_file && (dump_flags & TDF_DETAILS))
3037     {
3038       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3039       unsigned i;
3040       std::pair<ssa_op_iter, use_operand_p> *x;
3041       FOR_EACH_VEC_ELT (path, i, x)
3042         {
3043           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3044           dump_printf (MSG_NOTE, " ");
3045         }
3046       dump_printf (MSG_NOTE, "\n");
3047     }
3048
3049   /* Check whether the reduction path detected is valid.  */
3050   bool fail = path.length () == 0;
3051   bool neg = false;
3052   for (unsigned i = 1; i < path.length (); ++i)
3053     {
3054       gimple *use_stmt = USE_STMT (path[i].second);
3055       tree op = USE_FROM_PTR (path[i].second);
3056       if (! has_single_use (op)
3057           || ! is_gimple_assign (use_stmt))
3058         {
3059           fail = true;
3060           break;
3061         }
3062       if (gimple_assign_rhs_code (use_stmt) != code)
3063         {
3064           if (code == PLUS_EXPR
3065               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3066             {
3067               /* Track whether we negate the reduction value each iteration.  */
3068               if (gimple_assign_rhs2 (use_stmt) == op)
3069                 neg = ! neg;
3070             }
3071           else
3072             {
3073               fail = true;
3074               break;
3075             }
3076         }
3077     }
3078   return ! fail && ! neg;
3079 }
3080
3081
3082 /* Function vect_is_simple_reduction
3083
3084    (1) Detect a cross-iteration def-use cycle that represents a simple
3085    reduction computation.  We look for the following pattern:
3086
3087    loop_header:
3088      a1 = phi < a0, a2 >
3089      a3 = ...
3090      a2 = operation (a3, a1)
3091
3092    or
3093
3094    a3 = ...
3095    loop_header:
3096      a1 = phi < a0, a2 >
3097      a2 = operation (a3, a1)
3098
3099    such that:
3100    1. operation is commutative and associative and it is safe to
3101       change the order of the computation
3102    2. no uses for a2 in the loop (a2 is used out of the loop)
3103    3. no uses of a1 in the loop besides the reduction operation
3104    4. no uses of a1 outside the loop.
3105
3106    Conditions 1,4 are tested here.
3107    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3108
3109    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3110    nested cycles.
3111
3112    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3113    reductions:
3114
3115      a1 = phi < a0, a2 >
3116      inner loop (def of a3)
3117      a2 = phi < a3 >
3118
3119    (4) Detect condition expressions, ie:
3120      for (int i = 0; i < N; i++)
3121        if (a[i] < val)
3122         ret_val = a[i];
3123
3124 */
3125
3126 static gimple *
3127 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3128                           bool *double_reduc,
3129                           bool need_wrapping_integral_overflow,
3130                           enum vect_reduction_type *v_reduc_type)
3131 {
3132   struct loop *loop = (gimple_bb (phi))->loop_father;
3133   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3134   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3135   enum tree_code orig_code, code;
3136   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3137   tree type;
3138   int nloop_uses;
3139   tree name;
3140   imm_use_iterator imm_iter;
3141   use_operand_p use_p;
3142   bool phi_def;
3143
3144   *double_reduc = false;
3145   *v_reduc_type = TREE_CODE_REDUCTION;
3146
3147   tree phi_name = PHI_RESULT (phi);
3148   /* ???  If there are no uses of the PHI result the inner loop reduction
3149      won't be detected as possibly double-reduction by vectorizable_reduction
3150      because that tries to walk the PHI arg from the preheader edge which
3151      can be constant.  See PR60382.  */
3152   if (has_zero_uses (phi_name))
3153     return NULL;
3154   nloop_uses = 0;
3155   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3156     {
3157       gimple *use_stmt = USE_STMT (use_p);
3158       if (is_gimple_debug (use_stmt))
3159         continue;
3160
3161       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3162         {
3163           if (dump_enabled_p ())
3164             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3165                              "intermediate value used outside loop.\n");
3166
3167           return NULL;
3168         }
3169
3170       nloop_uses++;
3171       if (nloop_uses > 1)
3172         {
3173           if (dump_enabled_p ())
3174             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3175                              "reduction value used in loop.\n");
3176           return NULL;
3177         }
3178
3179       phi_use_stmt = use_stmt;
3180     }
3181
3182   edge latch_e = loop_latch_edge (loop);
3183   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3184   if (TREE_CODE (loop_arg) != SSA_NAME)
3185     {
3186       if (dump_enabled_p ())
3187         {
3188           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3189                            "reduction: not ssa_name: ");
3190           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3191           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3192         }
3193       return NULL;
3194     }
3195
3196   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3197   if (is_gimple_assign (def_stmt))
3198     {
3199       name = gimple_assign_lhs (def_stmt);
3200       phi_def = false;
3201     }
3202   else if (gimple_code (def_stmt) == GIMPLE_PHI)
3203     {
3204       name = PHI_RESULT (def_stmt);
3205       phi_def = true;
3206     }
3207   else
3208     {
3209       if (dump_enabled_p ())
3210         {
3211           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3212                            "reduction: unhandled reduction operation: ");
3213           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3214         }
3215       return NULL;
3216     }
3217
3218   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3219     return NULL;
3220
3221   nloop_uses = 0;
3222   auto_vec<gphi *, 3> lcphis;
3223   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3224     {
3225       gimple *use_stmt = USE_STMT (use_p);
3226       if (is_gimple_debug (use_stmt))
3227         continue;
3228       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3229         nloop_uses++;
3230       else
3231         /* We can have more than one loop-closed PHI.  */
3232         lcphis.safe_push (as_a <gphi *> (use_stmt));
3233       if (nloop_uses > 1)
3234         {
3235           if (dump_enabled_p ())
3236             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3237                              "reduction used in loop.\n");
3238           return NULL;
3239         }
3240     }
3241
3242   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3243      defined in the inner loop.  */
3244   if (phi_def)
3245     {
3246       op1 = PHI_ARG_DEF (def_stmt, 0);
3247
3248       if (gimple_phi_num_args (def_stmt) != 1
3249           || TREE_CODE (op1) != SSA_NAME)
3250         {
3251           if (dump_enabled_p ())
3252             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3253                              "unsupported phi node definition.\n");
3254
3255           return NULL;
3256         }
3257
3258       def1 = SSA_NAME_DEF_STMT (op1);
3259       if (gimple_bb (def1)
3260           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3261           && loop->inner
3262           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3263           && is_gimple_assign (def1)
3264           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3265         {
3266           if (dump_enabled_p ())
3267             report_vect_op (MSG_NOTE, def_stmt,
3268                             "detected double reduction: ");
3269
3270           *double_reduc = true;
3271           return def_stmt;
3272         }
3273
3274       return NULL;
3275     }
3276
3277   /* If we are vectorizing an inner reduction we are executing that
3278      in the original order only in case we are not dealing with a
3279      double reduction.  */
3280   bool check_reduction = true;
3281   if (flow_loop_nested_p (vect_loop, loop))
3282     {
3283       gphi *lcphi;
3284       unsigned i;
3285       check_reduction = false;
3286       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3287         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3288           {
3289             gimple *use_stmt = USE_STMT (use_p);
3290             if (is_gimple_debug (use_stmt))
3291               continue;
3292             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3293               check_reduction = true;
3294           }
3295     }
3296
3297   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3298   code = orig_code = gimple_assign_rhs_code (def_stmt);
3299
3300   /* We can handle "res -= x[i]", which is non-associative by
3301      simply rewriting this into "res += -x[i]".  Avoid changing
3302      gimple instruction for the first simple tests and only do this
3303      if we're allowed to change code at all.  */
3304   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3305     code = PLUS_EXPR;
3306
3307   if (code == COND_EXPR)
3308     {
3309       if (! nested_in_vect_loop)
3310         *v_reduc_type = COND_REDUCTION;
3311
3312       op3 = gimple_assign_rhs1 (def_stmt);
3313       if (COMPARISON_CLASS_P (op3))
3314         {
3315           op4 = TREE_OPERAND (op3, 1);
3316           op3 = TREE_OPERAND (op3, 0);
3317         }
3318       if (op3 == phi_name || op4 == phi_name)
3319         {
3320           if (dump_enabled_p ())
3321             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3322                             "reduction: condition depends on previous"
3323                             " iteration: ");
3324           return NULL;
3325         }
3326
3327       op1 = gimple_assign_rhs2 (def_stmt);
3328       op2 = gimple_assign_rhs3 (def_stmt);
3329     }
3330   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3331     {
3332       if (dump_enabled_p ())
3333         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3334                         "reduction: not commutative/associative: ");
3335       return NULL;
3336     }
3337   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3338     {
3339       op1 = gimple_assign_rhs1 (def_stmt);
3340       op2 = gimple_assign_rhs2 (def_stmt);
3341     }
3342   else
3343     {
3344       if (dump_enabled_p ())
3345         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3346                         "reduction: not handled operation: ");
3347       return NULL;
3348     }
3349
3350   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3351     {
3352       if (dump_enabled_p ())
3353         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3354                         "reduction: both uses not ssa_names: ");
3355
3356       return NULL;
3357     }
3358
3359   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3360   if ((TREE_CODE (op1) == SSA_NAME
3361        && !types_compatible_p (type,TREE_TYPE (op1)))
3362       || (TREE_CODE (op2) == SSA_NAME
3363           && !types_compatible_p (type, TREE_TYPE (op2)))
3364       || (op3 && TREE_CODE (op3) == SSA_NAME
3365           && !types_compatible_p (type, TREE_TYPE (op3)))
3366       || (op4 && TREE_CODE (op4) == SSA_NAME
3367           && !types_compatible_p (type, TREE_TYPE (op4))))
3368     {
3369       if (dump_enabled_p ())
3370         {
3371           dump_printf_loc (MSG_NOTE, vect_location,
3372                            "reduction: multiple types: operation type: ");
3373           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3374           dump_printf (MSG_NOTE, ", operands types: ");
3375           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3376                              TREE_TYPE (op1));
3377           dump_printf (MSG_NOTE, ",");
3378           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3379                              TREE_TYPE (op2));
3380           if (op3)
3381             {
3382               dump_printf (MSG_NOTE, ",");
3383               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3384                                  TREE_TYPE (op3));
3385             }
3386
3387           if (op4)
3388             {
3389               dump_printf (MSG_NOTE, ",");
3390               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3391                                  TREE_TYPE (op4));
3392             }
3393           dump_printf (MSG_NOTE, "\n");
3394         }
3395
3396       return NULL;
3397     }
3398
3399   /* Check whether it's ok to change the order of the computation.
3400      Generally, when vectorizing a reduction we change the order of the
3401      computation.  This may change the behavior of the program in some
3402      cases, so we need to check that this is ok.  One exception is when
3403      vectorizing an outer-loop: the inner-loop is executed sequentially,
3404      and therefore vectorizing reductions in the inner-loop during
3405      outer-loop vectorization is safe.  */
3406   if (check_reduction
3407       && *v_reduc_type == TREE_CODE_REDUCTION
3408       && needs_fold_left_reduction_p (type, code,
3409                                       need_wrapping_integral_overflow))
3410     *v_reduc_type = FOLD_LEFT_REDUCTION;
3411
3412   /* Reduction is safe. We're dealing with one of the following:
3413      1) integer arithmetic and no trapv
3414      2) floating point arithmetic, and special flags permit this optimization
3415      3) nested cycle (i.e., outer loop vectorization).  */
3416   if (TREE_CODE (op1) == SSA_NAME)
3417     def1 = SSA_NAME_DEF_STMT (op1);
3418
3419   if (TREE_CODE (op2) == SSA_NAME)
3420     def2 = SSA_NAME_DEF_STMT (op2);
3421
3422   if (code != COND_EXPR
3423       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3424     {
3425       if (dump_enabled_p ())
3426         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3427       return NULL;
3428     }
3429
3430   /* Check that one def is the reduction def, defined by PHI,
3431      the other def is either defined in the loop ("vect_internal_def"),
3432      or it's an induction (defined by a loop-header phi-node).  */
3433
3434   if (def2 && def2 == phi
3435       && (code == COND_EXPR
3436           || !def1 || gimple_nop_p (def1)
3437           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3438           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3439               && (is_gimple_assign (def1)
3440                   || is_gimple_call (def1)
3441                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3442                       == vect_induction_def
3443                   || (gimple_code (def1) == GIMPLE_PHI
3444                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3445                           == vect_internal_def
3446                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3447     {
3448       if (dump_enabled_p ())
3449         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3450       return def_stmt;
3451     }
3452
3453   if (def1 && def1 == phi
3454       && (code == COND_EXPR
3455           || !def2 || gimple_nop_p (def2)
3456           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3457           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3458               && (is_gimple_assign (def2)
3459                   || is_gimple_call (def2)
3460                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3461                        == vect_induction_def
3462                   || (gimple_code (def2) == GIMPLE_PHI
3463                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3464                            == vect_internal_def
3465                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3466     {
3467       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3468         {
3469           /* Check if we can swap operands (just for simplicity - so that
3470              the rest of the code can assume that the reduction variable
3471              is always the last (second) argument).  */
3472           if (code == COND_EXPR)
3473             {
3474               /* Swap cond_expr by inverting the condition.  */
3475               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3476               enum tree_code invert_code = ERROR_MARK;
3477               enum tree_code cond_code = TREE_CODE (cond_expr);
3478
3479               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3480                 {
3481                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3482                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3483                 }
3484               if (invert_code != ERROR_MARK)
3485                 {
3486                   TREE_SET_CODE (cond_expr, invert_code);
3487                   swap_ssa_operands (def_stmt,
3488                                      gimple_assign_rhs2_ptr (def_stmt),
3489                                      gimple_assign_rhs3_ptr (def_stmt));
3490                 }
3491               else
3492                 {
3493                   if (dump_enabled_p ())
3494                     report_vect_op (MSG_NOTE, def_stmt,
3495                                     "detected reduction: cannot swap operands "
3496                                     "for cond_expr");
3497                   return NULL;
3498                 }
3499             }
3500           else
3501             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3502                                gimple_assign_rhs2_ptr (def_stmt));
3503
3504           if (dump_enabled_p ())
3505             report_vect_op (MSG_NOTE, def_stmt,
3506                             "detected reduction: need to swap operands: ");
3507
3508           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3509             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3510         }
3511       else
3512         {
3513           if (dump_enabled_p ())
3514             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3515         }
3516
3517       return def_stmt;
3518     }
3519
3520   /* Try to find SLP reduction chain.  */
3521   if (! nested_in_vect_loop
3522       && code != COND_EXPR
3523       && orig_code != MINUS_EXPR
3524       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3525     {
3526       if (dump_enabled_p ())
3527         report_vect_op (MSG_NOTE, def_stmt,
3528                         "reduction: detected reduction chain: ");
3529
3530       return def_stmt;
3531     }
3532
3533   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3534   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3535   while (first)
3536     {
3537       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3538       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3539       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3540       first = next;
3541     }
3542
3543   /* Look for the expression computing loop_arg from loop PHI result.  */
3544   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3545                             code))
3546     return def_stmt;
3547
3548   if (dump_enabled_p ())
3549     {
3550       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3551                       "reduction: unknown pattern: ");
3552     }
3553
3554   return NULL;
3555 }
3556
3557 /* Wrapper around vect_is_simple_reduction, which will modify code
3558    in-place if it enables detection of more reductions.  Arguments
3559    as there.  */
3560
3561 gimple *
3562 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3563                              bool *double_reduc,
3564                              bool need_wrapping_integral_overflow)
3565 {
3566   enum vect_reduction_type v_reduc_type;
3567   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3568                                           need_wrapping_integral_overflow,
3569                                           &v_reduc_type);
3570   if (def)
3571     {
3572       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3573       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3574       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3575       reduc_def_info = vinfo_for_stmt (def);
3576       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3577       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3578     }
3579   return def;
3580 }
3581
3582 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3583 int
3584 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3585                              int *peel_iters_epilogue,
3586                              stmt_vector_for_cost *scalar_cost_vec,
3587                              stmt_vector_for_cost *prologue_cost_vec,
3588                              stmt_vector_for_cost *epilogue_cost_vec)
3589 {
3590   int retval = 0;
3591   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3592
3593   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3594     {
3595       *peel_iters_epilogue = assumed_vf / 2;
3596       if (dump_enabled_p ())
3597         dump_printf_loc (MSG_NOTE, vect_location,
3598                          "cost model: epilogue peel iters set to vf/2 "
3599                          "because loop iterations are unknown .\n");
3600
3601       /* If peeled iterations are known but number of scalar loop
3602          iterations are unknown, count a taken branch per peeled loop.  */
3603       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3604                                  NULL, 0, vect_prologue);
3605       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3606                                  NULL, 0, vect_epilogue);
3607     }
3608   else
3609     {
3610       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3611       peel_iters_prologue = niters < peel_iters_prologue ?
3612                             niters : peel_iters_prologue;
3613       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3614       /* If we need to peel for gaps, but no peeling is required, we have to
3615          peel VF iterations.  */
3616       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3617         *peel_iters_epilogue = assumed_vf;
3618     }
3619
3620   stmt_info_for_cost *si;
3621   int j;
3622   if (peel_iters_prologue)
3623     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3624         {
3625           stmt_vec_info stmt_info
3626             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3627           retval += record_stmt_cost (prologue_cost_vec,
3628                                       si->count * peel_iters_prologue,
3629                                       si->kind, stmt_info, si->misalign,
3630                                       vect_prologue);
3631         }
3632   if (*peel_iters_epilogue)
3633     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3634         {
3635           stmt_vec_info stmt_info
3636             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3637           retval += record_stmt_cost (epilogue_cost_vec,
3638                                       si->count * *peel_iters_epilogue,
3639                                       si->kind, stmt_info, si->misalign,
3640                                       vect_epilogue);
3641         }
3642
3643   return retval;
3644 }
3645
3646 /* Function vect_estimate_min_profitable_iters
3647
3648    Return the number of iterations required for the vector version of the
3649    loop to be profitable relative to the cost of the scalar version of the
3650    loop.
3651
3652    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3653    of iterations for vectorization.  -1 value means loop vectorization
3654    is not profitable.  This returned value may be used for dynamic
3655    profitability check.
3656
3657    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3658    for static check against estimated number of iterations.  */
3659
3660 static void
3661 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3662                                     int *ret_min_profitable_niters,
3663                                     int *ret_min_profitable_estimate)
3664 {
3665   int min_profitable_iters;
3666   int min_profitable_estimate;
3667   int peel_iters_prologue;
3668   int peel_iters_epilogue;
3669   unsigned vec_inside_cost = 0;
3670   int vec_outside_cost = 0;
3671   unsigned vec_prologue_cost = 0;
3672   unsigned vec_epilogue_cost = 0;
3673   int scalar_single_iter_cost = 0;
3674   int scalar_outside_cost = 0;
3675   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3676   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3677   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3678
3679   /* Cost model disabled.  */
3680   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3681     {
3682       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3683       *ret_min_profitable_niters = 0;
3684       *ret_min_profitable_estimate = 0;
3685       return;
3686     }
3687
3688   /* Requires loop versioning tests to handle misalignment.  */
3689   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3690     {
3691       /*  FIXME: Make cost depend on complexity of individual check.  */
3692       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3693       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3694                             vect_prologue);
3695       dump_printf (MSG_NOTE,
3696                    "cost model: Adding cost of checks for loop "
3697                    "versioning to treat misalignment.\n");
3698     }
3699
3700   /* Requires loop versioning with alias checks.  */
3701   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3702     {
3703       /*  FIXME: Make cost depend on complexity of individual check.  */
3704       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3705       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3706                             vect_prologue);
3707       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3708       if (len)
3709         /* Count LEN - 1 ANDs and LEN comparisons.  */
3710         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3711                               NULL, 0, vect_prologue);
3712       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3713       if (len)
3714         {
3715           /* Count LEN - 1 ANDs and LEN comparisons.  */
3716           unsigned int nstmts = len * 2 - 1;
3717           /* +1 for each bias that needs adding.  */
3718           for (unsigned int i = 0; i < len; ++i)
3719             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3720               nstmts += 1;
3721           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3722                                 NULL, 0, vect_prologue);
3723         }
3724       dump_printf (MSG_NOTE,
3725                    "cost model: Adding cost of checks for loop "
3726                    "versioning aliasing.\n");
3727     }
3728
3729   /* Requires loop versioning with niter checks.  */
3730   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3731     {
3732       /*  FIXME: Make cost depend on complexity of individual check.  */
3733       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3734                             vect_prologue);
3735       dump_printf (MSG_NOTE,
3736                    "cost model: Adding cost of checks for loop "
3737                    "versioning niters.\n");
3738     }
3739
3740   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3741     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3742                           vect_prologue);
3743
3744   /* Count statements in scalar loop.  Using this as scalar cost for a single
3745      iteration for now.
3746
3747      TODO: Add outer loop support.
3748
3749      TODO: Consider assigning different costs to different scalar
3750      statements.  */
3751
3752   scalar_single_iter_cost
3753     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3754
3755   /* Add additional cost for the peeled instructions in prologue and epilogue
3756      loop.  (For fully-masked loops there will be no peeling.)
3757
3758      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3759      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3760
3761      TODO: Build an expression that represents peel_iters for prologue and
3762      epilogue to be used in a run-time test.  */
3763
3764   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3765     {
3766       peel_iters_prologue = 0;
3767       peel_iters_epilogue = 0;
3768
3769       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3770         {
3771           /* We need to peel exactly one iteration.  */
3772           peel_iters_epilogue += 1;
3773           stmt_info_for_cost *si;
3774           int j;
3775           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3776                             j, si)
3777             {
3778               struct _stmt_vec_info *stmt_info
3779                 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3780               (void) add_stmt_cost (target_cost_data, si->count,
3781                                     si->kind, stmt_info, si->misalign,
3782                                     vect_epilogue);
3783             }
3784         }
3785     }
3786   else if (npeel < 0)
3787     {
3788       peel_iters_prologue = assumed_vf / 2;
3789       dump_printf (MSG_NOTE, "cost model: "
3790                    "prologue peel iters set to vf/2.\n");
3791
3792       /* If peeling for alignment is unknown, loop bound of main loop becomes
3793          unknown.  */
3794       peel_iters_epilogue = assumed_vf / 2;
3795       dump_printf (MSG_NOTE, "cost model: "
3796                    "epilogue peel iters set to vf/2 because "
3797                    "peeling for alignment is unknown.\n");
3798
3799       /* If peeled iterations are unknown, count a taken branch and a not taken
3800          branch per peeled loop. Even if scalar loop iterations are known,
3801          vector iterations are not known since peeled prologue iterations are
3802          not known. Hence guards remain the same.  */
3803       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3804                             NULL, 0, vect_prologue);
3805       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3806                             NULL, 0, vect_prologue);
3807       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3808                             NULL, 0, vect_epilogue);
3809       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3810                             NULL, 0, vect_epilogue);
3811       stmt_info_for_cost *si;
3812       int j;
3813       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3814         {
3815           struct _stmt_vec_info *stmt_info
3816             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3817           (void) add_stmt_cost (target_cost_data,
3818                                 si->count * peel_iters_prologue,
3819                                 si->kind, stmt_info, si->misalign,
3820                                 vect_prologue);
3821           (void) add_stmt_cost (target_cost_data,
3822                                 si->count * peel_iters_epilogue,
3823                                 si->kind, stmt_info, si->misalign,
3824                                 vect_epilogue);
3825         }
3826     }
3827   else
3828     {
3829       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3830       stmt_info_for_cost *si;
3831       int j;
3832       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3833
3834       prologue_cost_vec.create (2);
3835       epilogue_cost_vec.create (2);
3836       peel_iters_prologue = npeel;
3837
3838       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3839                                           &peel_iters_epilogue,
3840                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3841                                             (loop_vinfo),
3842                                           &prologue_cost_vec,
3843                                           &epilogue_cost_vec);
3844
3845       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3846         {
3847           struct _stmt_vec_info *stmt_info
3848             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3849           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3850                                 si->misalign, vect_prologue);
3851         }
3852
3853       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3854         {
3855           struct _stmt_vec_info *stmt_info
3856             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3857           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3858                                 si->misalign, vect_epilogue);
3859         }
3860
3861       prologue_cost_vec.release ();
3862       epilogue_cost_vec.release ();
3863     }
3864
3865   /* FORNOW: The scalar outside cost is incremented in one of the
3866      following ways:
3867
3868      1. The vectorizer checks for alignment and aliasing and generates
3869      a condition that allows dynamic vectorization.  A cost model
3870      check is ANDED with the versioning condition.  Hence scalar code
3871      path now has the added cost of the versioning check.
3872
3873        if (cost > th & versioning_check)
3874          jmp to vector code
3875
3876      Hence run-time scalar is incremented by not-taken branch cost.
3877
3878      2. The vectorizer then checks if a prologue is required.  If the
3879      cost model check was not done before during versioning, it has to
3880      be done before the prologue check.
3881
3882        if (cost <= th)
3883          prologue = scalar_iters
3884        if (prologue == 0)
3885          jmp to vector code
3886        else
3887          execute prologue
3888        if (prologue == num_iters)
3889          go to exit
3890
3891      Hence the run-time scalar cost is incremented by a taken branch,
3892      plus a not-taken branch, plus a taken branch cost.
3893
3894      3. The vectorizer then checks if an epilogue is required.  If the
3895      cost model check was not done before during prologue check, it
3896      has to be done with the epilogue check.
3897
3898        if (prologue == 0)
3899          jmp to vector code
3900        else
3901          execute prologue
3902        if (prologue == num_iters)
3903          go to exit
3904        vector code:
3905          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3906            jmp to epilogue
3907
3908      Hence the run-time scalar cost should be incremented by 2 taken
3909      branches.
3910
3911      TODO: The back end may reorder the BBS's differently and reverse
3912      conditions/branch directions.  Change the estimates below to
3913      something more reasonable.  */
3914
3915   /* If the number of iterations is known and we do not do versioning, we can
3916      decide whether to vectorize at compile time.  Hence the scalar version
3917      do not carry cost model guard costs.  */
3918   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3919       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3920     {
3921       /* Cost model check occurs at versioning.  */
3922       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3923         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3924       else
3925         {
3926           /* Cost model check occurs at prologue generation.  */
3927           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3928             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3929               + vect_get_stmt_cost (cond_branch_not_taken);
3930           /* Cost model check occurs at epilogue generation.  */
3931           else
3932             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3933         }
3934     }
3935
3936   /* Complete the target-specific cost calculations.  */
3937   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3938                &vec_inside_cost, &vec_epilogue_cost);
3939
3940   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3941
3942   if (dump_enabled_p ())
3943     {
3944       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3945       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3946                    vec_inside_cost);
3947       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3948                    vec_prologue_cost);
3949       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3950                    vec_epilogue_cost);
3951       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3952                    scalar_single_iter_cost);
3953       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3954                    scalar_outside_cost);
3955       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3956                    vec_outside_cost);
3957       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3958                    peel_iters_prologue);
3959       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3960                    peel_iters_epilogue);
3961     }
3962
3963   /* Calculate number of iterations required to make the vector version
3964      profitable, relative to the loop bodies only.  The following condition
3965      must hold true:
3966      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3967      where
3968      SIC = scalar iteration cost, VIC = vector iteration cost,
3969      VOC = vector outside cost, VF = vectorization factor,
3970      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3971      SOC = scalar outside cost for run time cost model check.  */
3972
3973   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3974     {
3975       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3976                               * assumed_vf
3977                               - vec_inside_cost * peel_iters_prologue
3978                               - vec_inside_cost * peel_iters_epilogue);
3979       if (min_profitable_iters <= 0)
3980         min_profitable_iters = 0;
3981       else
3982         {
3983           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3984                                    - vec_inside_cost);
3985
3986           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3987               <= (((int) vec_inside_cost * min_profitable_iters)
3988                   + (((int) vec_outside_cost - scalar_outside_cost)
3989                      * assumed_vf)))
3990             min_profitable_iters++;
3991         }
3992     }
3993   /* vector version will never be profitable.  */
3994   else
3995     {
3996       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3997         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3998                     "did not happen for a simd loop");
3999
4000       if (dump_enabled_p ())
4001         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4002                          "cost model: the vector iteration cost = %d "
4003                          "divided by the scalar iteration cost = %d "
4004                          "is greater or equal to the vectorization factor = %d"
4005                          ".\n",
4006                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4007       *ret_min_profitable_niters = -1;
4008       *ret_min_profitable_estimate = -1;
4009       return;
4010     }
4011
4012   dump_printf (MSG_NOTE,
4013                "  Calculated minimum iters for profitability: %d\n",
4014                min_profitable_iters);
4015
4016   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4017       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4018     /* We want the vectorized loop to execute at least once.  */
4019     min_profitable_iters = assumed_vf + peel_iters_prologue;
4020
4021   if (dump_enabled_p ())
4022     dump_printf_loc (MSG_NOTE, vect_location,
4023                      "  Runtime profitability threshold = %d\n",
4024                      min_profitable_iters);
4025
4026   *ret_min_profitable_niters = min_profitable_iters;
4027
4028   /* Calculate number of iterations required to make the vector version
4029      profitable, relative to the loop bodies only.
4030
4031      Non-vectorized variant is SIC * niters and it must win over vector
4032      variant on the expected loop trip count.  The following condition must hold true:
4033      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
4034
4035   if (vec_outside_cost <= 0)
4036     min_profitable_estimate = 0;
4037   else
4038     {
4039       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4040                                  * assumed_vf
4041                                  - vec_inside_cost * peel_iters_prologue
4042                                  - vec_inside_cost * peel_iters_epilogue)
4043                                  / ((scalar_single_iter_cost * assumed_vf)
4044                                    - vec_inside_cost);
4045     }
4046   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4047   if (dump_enabled_p ())
4048     dump_printf_loc (MSG_NOTE, vect_location,
4049                      "  Static estimate profitability threshold = %d\n",
4050                      min_profitable_estimate);
4051
4052   *ret_min_profitable_estimate = min_profitable_estimate;
4053 }
4054
4055 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4056    vector elements (not bits) for a vector with NELT elements.  */
4057 static void
4058 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4059                               vec_perm_builder *sel)
4060 {
4061   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4062      by vec_perm_indices.  */
4063   sel->new_vector (nelt, 1, 3);
4064   for (unsigned int i = 0; i < 3; i++)
4065     sel->quick_push (i + offset);
4066 }
4067
4068 /* Checks whether the target supports whole-vector shifts for vectors of mode
4069    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4070    it supports vec_perm_const with masks for all necessary shift amounts.  */
4071 static bool
4072 have_whole_vector_shift (machine_mode mode)
4073 {
4074   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4075     return true;
4076
4077   /* Variable-length vectors should be handled via the optab.  */
4078   unsigned int nelt;
4079   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4080     return false;
4081
4082   vec_perm_builder sel;
4083   vec_perm_indices indices;
4084   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4085     {
4086       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4087       indices.new_vector (sel, 2, nelt);
4088       if (!can_vec_perm_const_p (mode, indices, false))
4089         return false;
4090     }
4091   return true;
4092 }
4093
4094 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4095    functions. Design better to avoid maintenance issues.  */
4096
4097 /* Function vect_model_reduction_cost.
4098
4099    Models cost for a reduction operation, including the vector ops
4100    generated within the strip-mine loop, the initial definition before
4101    the loop, and the epilogue code that must be generated.  */
4102
4103 static void
4104 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4105                            int ncopies)
4106 {
4107   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4108   enum tree_code code;
4109   optab optab;
4110   tree vectype;
4111   gimple *orig_stmt;
4112   machine_mode mode;
4113   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4114   struct loop *loop = NULL;
4115   void *target_cost_data;
4116
4117   if (loop_vinfo)
4118     {
4119       loop = LOOP_VINFO_LOOP (loop_vinfo);
4120       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4121     }
4122   else
4123     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4124
4125   /* Condition reductions generate two reductions in the loop.  */
4126   vect_reduction_type reduction_type
4127     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4128   if (reduction_type == COND_REDUCTION)
4129     ncopies *= 2;
4130
4131   vectype = STMT_VINFO_VECTYPE (stmt_info);
4132   mode = TYPE_MODE (vectype);
4133   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4134
4135   if (!orig_stmt)
4136     orig_stmt = STMT_VINFO_STMT (stmt_info);
4137
4138   code = gimple_assign_rhs_code (orig_stmt);
4139
4140   if (reduction_type == EXTRACT_LAST_REDUCTION
4141       || reduction_type == FOLD_LEFT_REDUCTION)
4142     {
4143       /* No extra instructions needed in the prologue.  */
4144       prologue_cost = 0;
4145
4146       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4147         /* Count one reduction-like operation per vector.  */
4148         inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4149                                      stmt_info, 0, vect_body);
4150       else
4151         {
4152           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4153           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4154           inside_cost = add_stmt_cost (target_cost_data,  nelements,
4155                                        vec_to_scalar, stmt_info, 0,
4156                                        vect_body);
4157           inside_cost += add_stmt_cost (target_cost_data,  nelements,
4158                                         scalar_stmt, stmt_info, 0,
4159                                         vect_body);
4160         }
4161     }
4162   else
4163     {
4164       /* Add in cost for initial definition.
4165          For cond reduction we have four vectors: initial index, step,
4166          initial result of the data reduction, initial value of the index
4167          reduction.  */
4168       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4169       prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4170                                       scalar_to_vec, stmt_info, 0,
4171                                       vect_prologue);
4172
4173       /* Cost of reduction op inside loop.  */
4174       inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4175                                    stmt_info, 0, vect_body);
4176     }
4177
4178   /* Determine cost of epilogue code.
4179
4180      We have a reduction operator that will reduce the vector in one statement.
4181      Also requires scalar extract.  */
4182
4183   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4184     {
4185       if (reduc_fn != IFN_LAST)
4186         {
4187           if (reduction_type == COND_REDUCTION)
4188             {
4189               /* An EQ stmt and an COND_EXPR stmt.  */
4190               epilogue_cost += add_stmt_cost (target_cost_data, 2,
4191                                               vector_stmt, stmt_info, 0,
4192                                               vect_epilogue);
4193               /* Reduction of the max index and a reduction of the found
4194                  values.  */
4195               epilogue_cost += add_stmt_cost (target_cost_data, 2,
4196                                               vec_to_scalar, stmt_info, 0,
4197                                               vect_epilogue);
4198               /* A broadcast of the max value.  */
4199               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4200                                               scalar_to_vec, stmt_info, 0,
4201                                               vect_epilogue);
4202             }
4203           else
4204             {
4205               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4206                                               stmt_info, 0, vect_epilogue);
4207               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4208                                               vec_to_scalar, stmt_info, 0,
4209                                               vect_epilogue);
4210             }
4211         }
4212       else if (reduction_type == COND_REDUCTION)
4213         {
4214           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4215           /* Extraction of scalar elements.  */
4216           epilogue_cost += add_stmt_cost (target_cost_data,
4217                                           2 * estimated_nunits,
4218                                           vec_to_scalar, stmt_info, 0,
4219                                           vect_epilogue);
4220           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4221           epilogue_cost += add_stmt_cost (target_cost_data,
4222                                           2 * estimated_nunits - 3,
4223                                           scalar_stmt, stmt_info, 0,
4224                                           vect_epilogue);
4225         }
4226       else if (reduction_type == EXTRACT_LAST_REDUCTION
4227                || reduction_type == FOLD_LEFT_REDUCTION)
4228         /* No extra instructions need in the epilogue.  */
4229         ;
4230       else
4231         {
4232           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4233           tree bitsize =
4234             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4235           int element_bitsize = tree_to_uhwi (bitsize);
4236           int nelements = vec_size_in_bits / element_bitsize;
4237
4238           if (code == COND_EXPR)
4239             code = MAX_EXPR;
4240
4241           optab = optab_for_tree_code (code, vectype, optab_default);
4242
4243           /* We have a whole vector shift available.  */
4244           if (optab != unknown_optab
4245               && VECTOR_MODE_P (mode)
4246               && optab_handler (optab, mode) != CODE_FOR_nothing
4247               && have_whole_vector_shift (mode))
4248             {
4249               /* Final reduction via vector shifts and the reduction operator.
4250                  Also requires scalar extract.  */
4251               epilogue_cost += add_stmt_cost (target_cost_data,
4252                                               exact_log2 (nelements) * 2,
4253                                               vector_stmt, stmt_info, 0,
4254                                               vect_epilogue);
4255               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4256                                               vec_to_scalar, stmt_info, 0,
4257                                               vect_epilogue);
4258             }
4259           else
4260             /* Use extracts and reduction op for final reduction.  For N
4261                elements, we have N extracts and N-1 reduction ops.  */
4262             epilogue_cost += add_stmt_cost (target_cost_data,
4263                                             nelements + nelements - 1,
4264                                             vector_stmt, stmt_info, 0,
4265                                             vect_epilogue);
4266         }
4267     }
4268
4269   if (dump_enabled_p ())
4270     dump_printf (MSG_NOTE,
4271                  "vect_model_reduction_cost: inside_cost = %d, "
4272                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4273                  prologue_cost, epilogue_cost);
4274 }
4275
4276
4277 /* Function vect_model_induction_cost.
4278
4279    Models cost for induction operations.  */
4280
4281 static void
4282 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4283 {
4284   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4285   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4286   unsigned inside_cost, prologue_cost;
4287
4288   if (PURE_SLP_STMT (stmt_info))
4289     return;
4290
4291   /* loop cost for vec_loop.  */
4292   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4293                                stmt_info, 0, vect_body);
4294
4295   /* prologue cost for vec_init and vec_step.  */
4296   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4297                                  stmt_info, 0, vect_prologue);
4298
4299   if (dump_enabled_p ())
4300     dump_printf_loc (MSG_NOTE, vect_location,
4301                      "vect_model_induction_cost: inside_cost = %d, "
4302                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4303 }
4304
4305
4306
4307 /* Function get_initial_def_for_reduction
4308
4309    Input:
4310    STMT - a stmt that performs a reduction operation in the loop.
4311    INIT_VAL - the initial value of the reduction variable
4312
4313    Output:
4314    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4315         of the reduction (used for adjusting the epilog - see below).
4316    Return a vector variable, initialized according to the operation that STMT
4317         performs. This vector will be used as the initial value of the
4318         vector of partial results.
4319
4320    Option1 (adjust in epilog): Initialize the vector as follows:
4321      add/bit or/xor:    [0,0,...,0,0]
4322      mult/bit and:      [1,1,...,1,1]
4323      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4324    and when necessary (e.g. add/mult case) let the caller know
4325    that it needs to adjust the result by init_val.
4326
4327    Option2: Initialize the vector as follows:
4328      add/bit or/xor:    [init_val,0,0,...,0]
4329      mult/bit and:      [init_val,1,1,...,1]
4330      min/max/cond_expr: [init_val,init_val,...,init_val]
4331    and no adjustments are needed.
4332
4333    For example, for the following code:
4334
4335    s = init_val;
4336    for (i=0;i<n;i++)
4337      s = s + a[i];
4338
4339    STMT is 's = s + a[i]', and the reduction variable is 's'.
4340    For a vector of 4 units, we want to return either [0,0,0,init_val],
4341    or [0,0,0,0] and let the caller know that it needs to adjust
4342    the result at the end by 'init_val'.
4343
4344    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4345    initialization vector is simpler (same element in all entries), if
4346    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4347
4348    A cost model should help decide between these two schemes.  */
4349
4350 tree
4351 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4352                                tree *adjustment_def)
4353 {
4354   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4355   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4356   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4357   tree scalar_type = TREE_TYPE (init_val);
4358   tree vectype = get_vectype_for_scalar_type (scalar_type);
4359   enum tree_code code = gimple_assign_rhs_code (stmt);
4360   tree def_for_init;
4361   tree init_def;
4362   bool nested_in_vect_loop = false;
4363   REAL_VALUE_TYPE real_init_val = dconst0;
4364   int int_init_val = 0;
4365   gimple *def_stmt = NULL;
4366   gimple_seq stmts = NULL;
4367
4368   gcc_assert (vectype);
4369
4370   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4371               || SCALAR_FLOAT_TYPE_P (scalar_type));
4372
4373   if (nested_in_vect_loop_p (loop, stmt))
4374     nested_in_vect_loop = true;
4375   else
4376     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4377
4378   /* In case of double reduction we only create a vector variable to be put
4379      in the reduction phi node.  The actual statement creation is done in
4380      vect_create_epilog_for_reduction.  */
4381   if (adjustment_def && nested_in_vect_loop
4382       && TREE_CODE (init_val) == SSA_NAME
4383       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4384       && gimple_code (def_stmt) == GIMPLE_PHI
4385       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4386       && vinfo_for_stmt (def_stmt)
4387       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4388           == vect_double_reduction_def)
4389     {
4390       *adjustment_def = NULL;
4391       return vect_create_destination_var (init_val, vectype);
4392     }
4393
4394   vect_reduction_type reduction_type
4395     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4396
4397   /* In case of a nested reduction do not use an adjustment def as
4398      that case is not supported by the epilogue generation correctly
4399      if ncopies is not one.  */
4400   if (adjustment_def && nested_in_vect_loop)
4401     {
4402       *adjustment_def = NULL;
4403       return vect_get_vec_def_for_operand (init_val, stmt);
4404     }
4405
4406   switch (code)
4407     {
4408     case WIDEN_SUM_EXPR:
4409     case DOT_PROD_EXPR:
4410     case SAD_EXPR:
4411     case PLUS_EXPR:
4412     case MINUS_EXPR:
4413     case BIT_IOR_EXPR:
4414     case BIT_XOR_EXPR:
4415     case MULT_EXPR:
4416     case BIT_AND_EXPR:
4417       {
4418         /* ADJUSTMENT_DEF is NULL when called from
4419            vect_create_epilog_for_reduction to vectorize double reduction.  */
4420         if (adjustment_def)
4421           *adjustment_def = init_val;
4422
4423         if (code == MULT_EXPR)
4424           {
4425             real_init_val = dconst1;
4426             int_init_val = 1;
4427           }
4428
4429         if (code == BIT_AND_EXPR)
4430           int_init_val = -1;
4431
4432         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4433           def_for_init = build_real (scalar_type, real_init_val);
4434         else
4435           def_for_init = build_int_cst (scalar_type, int_init_val);
4436
4437         if (adjustment_def)
4438           /* Option1: the first element is '0' or '1' as well.  */
4439           init_def = gimple_build_vector_from_val (&stmts, vectype,
4440                                                    def_for_init);
4441         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4442           {
4443             /* Option2 (variable length): the first element is INIT_VAL.  */
4444             init_def = build_vector_from_val (vectype, def_for_init);
4445             gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4446                                                       2, init_def, init_val);
4447             init_def = make_ssa_name (vectype);
4448             gimple_call_set_lhs (call, init_def);
4449             gimple_seq_add_stmt (&stmts, call);
4450           }
4451         else
4452           {
4453             /* Option2: the first element is INIT_VAL.  */
4454             tree_vector_builder elts (vectype, 1, 2);
4455             elts.quick_push (init_val);
4456             elts.quick_push (def_for_init);
4457             init_def = gimple_build_vector (&stmts, &elts);
4458           }
4459       }
4460       break;
4461
4462     case MIN_EXPR:
4463     case MAX_EXPR:
4464     case COND_EXPR:
4465       {
4466         if (adjustment_def)
4467           {
4468             *adjustment_def = NULL_TREE;
4469             if (reduction_type != COND_REDUCTION
4470                 && reduction_type != EXTRACT_LAST_REDUCTION)
4471               {
4472                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4473                 break;
4474               }
4475           }
4476         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4477         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4478       }
4479       break;
4480
4481     default:
4482       gcc_unreachable ();
4483     }
4484
4485   if (stmts)
4486     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4487   return init_def;
4488 }
4489
4490 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4491    NUMBER_OF_VECTORS is the number of vector defs to create.
4492    If NEUTRAL_OP is nonnull, introducing extra elements of that
4493    value will not change the result.  */
4494
4495 static void
4496 get_initial_defs_for_reduction (slp_tree slp_node,
4497                                 vec<tree> *vec_oprnds,
4498                                 unsigned int number_of_vectors,
4499                                 bool reduc_chain, tree neutral_op)
4500 {
4501   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4502   gimple *stmt = stmts[0];
4503   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4504   unsigned HOST_WIDE_INT nunits;
4505   unsigned j, number_of_places_left_in_vector;
4506   tree vector_type;
4507   tree vop;
4508   int group_size = stmts.length ();
4509   unsigned int vec_num, i;
4510   unsigned number_of_copies = 1;
4511   vec<tree> voprnds;
4512   voprnds.create (number_of_vectors);
4513   struct loop *loop;
4514   auto_vec<tree, 16> permute_results;
4515
4516   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4517
4518   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4519
4520   loop = (gimple_bb (stmt))->loop_father;
4521   gcc_assert (loop);
4522   edge pe = loop_preheader_edge (loop);
4523
4524   gcc_assert (!reduc_chain || neutral_op);
4525
4526   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4527      created vectors. It is greater than 1 if unrolling is performed.
4528
4529      For example, we have two scalar operands, s1 and s2 (e.g., group of
4530      strided accesses of size two), while NUNITS is four (i.e., four scalars
4531      of this type can be packed in a vector).  The output vector will contain
4532      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4533      will be 2).
4534
4535      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4536      containing the operands.
4537
4538      For example, NUNITS is four as before, and the group size is 8
4539      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4540      {s5, s6, s7, s8}.  */
4541
4542   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4543     nunits = group_size;
4544
4545   number_of_copies = nunits * number_of_vectors / group_size;
4546
4547   number_of_places_left_in_vector = nunits;
4548   bool constant_p = true;
4549   tree_vector_builder elts (vector_type, nunits, 1);
4550   elts.quick_grow (nunits);
4551   for (j = 0; j < number_of_copies; j++)
4552     {
4553       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4554         {
4555           tree op;
4556           /* Get the def before the loop.  In reduction chain we have only
4557              one initial value.  */
4558           if ((j != (number_of_copies - 1)
4559                || (reduc_chain && i != 0))
4560               && neutral_op)
4561             op = neutral_op;
4562           else
4563             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4564
4565           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4566           number_of_places_left_in_vector--;
4567           elts[number_of_places_left_in_vector] = op;
4568           if (!CONSTANT_CLASS_P (op))
4569             constant_p = false;
4570
4571           if (number_of_places_left_in_vector == 0)
4572             {
4573               gimple_seq ctor_seq = NULL;
4574               tree init;
4575               if (constant_p && !neutral_op
4576                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4577                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4578                 /* Build the vector directly from ELTS.  */
4579                 init = gimple_build_vector (&ctor_seq, &elts);
4580               else if (neutral_op)
4581                 {
4582                   /* Build a vector of the neutral value and shift the
4583                      other elements into place.  */
4584                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4585                                                        neutral_op);
4586                   int k = nunits;
4587                   while (k > 0 && elts[k - 1] == neutral_op)
4588                     k -= 1;
4589                   while (k > 0)
4590                     {
4591                       k -= 1;
4592                       gcall *call = gimple_build_call_internal
4593                         (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4594                       init = make_ssa_name (vector_type);
4595                       gimple_call_set_lhs (call, init);
4596                       gimple_seq_add_stmt (&ctor_seq, call);
4597                     }
4598                 }
4599               else
4600                 {
4601                   /* First time round, duplicate ELTS to fill the
4602                      required number of vectors, then cherry pick the
4603                      appropriate result for each iteration.  */
4604                   if (vec_oprnds->is_empty ())
4605                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4606                                               number_of_vectors,
4607                                               permute_results);
4608                   init = permute_results[number_of_vectors - j - 1];
4609                 }
4610               if (ctor_seq != NULL)
4611                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4612               voprnds.quick_push (init);
4613
4614               number_of_places_left_in_vector = nunits;
4615               elts.new_vector (vector_type, nunits, 1);
4616               elts.quick_grow (nunits);
4617               constant_p = true;
4618             }
4619         }
4620     }
4621
4622   /* Since the vectors are created in the reverse order, we should invert
4623      them.  */
4624   vec_num = voprnds.length ();
4625   for (j = vec_num; j != 0; j--)
4626     {
4627       vop = voprnds[j - 1];
4628       vec_oprnds->quick_push (vop);
4629     }
4630
4631   voprnds.release ();
4632
4633   /* In case that VF is greater than the unrolling factor needed for the SLP
4634      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4635      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4636      to replicate the vectors.  */
4637   tree neutral_vec = NULL;
4638   while (number_of_vectors > vec_oprnds->length ())
4639     {
4640       if (neutral_op)
4641         {
4642           if (!neutral_vec)
4643             {
4644               gimple_seq ctor_seq = NULL;
4645               neutral_vec = gimple_build_vector_from_val
4646                 (&ctor_seq, vector_type, neutral_op);
4647               if (ctor_seq != NULL)
4648                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4649             }
4650           vec_oprnds->quick_push (neutral_vec);
4651         }
4652       else
4653         {
4654           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4655             vec_oprnds->quick_push (vop);
4656         }
4657     }
4658 }
4659
4660
4661 /* Function vect_create_epilog_for_reduction
4662
4663    Create code at the loop-epilog to finalize the result of a reduction
4664    computation.
4665
4666    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4667      reduction statements.
4668    STMT is the scalar reduction stmt that is being vectorized.
4669    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4670      number of elements that we can fit in a vectype (nunits).  In this case
4671      we have to generate more than one vector stmt - i.e - we need to "unroll"
4672      the vector stmt by a factor VF/nunits.  For more details see documentation
4673      in vectorizable_operation.
4674    REDUC_FN is the internal function for the epilog reduction.
4675    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4676      computation.
4677    REDUC_INDEX is the index of the operand in the right hand side of the
4678      statement that is defined by REDUCTION_PHI.
4679    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4680    SLP_NODE is an SLP node containing a group of reduction statements. The
4681      first one in this group is STMT.
4682    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4683      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4684      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4685      any value of the IV in the loop.
4686    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4687    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4688      null if this is not an SLP reduction
4689
4690    This function:
4691    1. Creates the reduction def-use cycles: sets the arguments for
4692       REDUCTION_PHIS:
4693       The loop-entry argument is the vectorized initial-value of the reduction.
4694       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4695       sums.
4696    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4697       by calling the function specified by REDUC_FN if available, or by
4698       other means (whole-vector shifts or a scalar loop).
4699       The function also creates a new phi node at the loop exit to preserve
4700       loop-closed form, as illustrated below.
4701
4702      The flow at the entry to this function:
4703
4704         loop:
4705           vec_def = phi <null, null>            # REDUCTION_PHI
4706           VECT_DEF = vector_stmt                # vectorized form of STMT
4707           s_loop = scalar_stmt                  # (scalar) STMT
4708         loop_exit:
4709           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4710           use <s_out0>
4711           use <s_out0>
4712
4713      The above is transformed by this function into:
4714
4715         loop:
4716           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4717           VECT_DEF = vector_stmt                # vectorized form of STMT
4718           s_loop = scalar_stmt                  # (scalar) STMT
4719         loop_exit:
4720           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4721           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4722           v_out2 = reduce <v_out1>
4723           s_out3 = extract_field <v_out2, 0>
4724           s_out4 = adjust_result <s_out3>
4725           use <s_out4>
4726           use <s_out4>
4727 */
4728
4729 static void
4730 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4731                                   gimple *reduc_def_stmt,
4732                                   int ncopies, internal_fn reduc_fn,
4733                                   vec<gimple *> reduction_phis,
4734                                   bool double_reduc,
4735                                   slp_tree slp_node,
4736                                   slp_instance slp_node_instance,
4737                                   tree induc_val, enum tree_code induc_code,
4738                                   tree neutral_op)
4739 {
4740   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4741   stmt_vec_info prev_phi_info;
4742   tree vectype;
4743   machine_mode mode;
4744   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4745   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4746   basic_block exit_bb;
4747   tree scalar_dest;
4748   tree scalar_type;
4749   gimple *new_phi = NULL, *phi;
4750   gimple_stmt_iterator exit_gsi;
4751   tree vec_dest;
4752   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4753   gimple *epilog_stmt = NULL;
4754   enum tree_code code = gimple_assign_rhs_code (stmt);
4755   gimple *exit_phi;
4756   tree bitsize;
4757   tree adjustment_def = NULL;
4758   tree vec_initial_def = NULL;
4759   tree expr, def, initial_def = NULL;
4760   tree orig_name, scalar_result;
4761   imm_use_iterator imm_iter, phi_imm_iter;
4762   use_operand_p use_p, phi_use_p;
4763   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4764   bool nested_in_vect_loop = false;
4765   auto_vec<gimple *> new_phis;
4766   auto_vec<gimple *> inner_phis;
4767   enum vect_def_type dt = vect_unknown_def_type;
4768   int j, i;
4769   auto_vec<tree> scalar_results;
4770   unsigned int group_size = 1, k, ratio;
4771   auto_vec<tree> vec_initial_defs;
4772   auto_vec<gimple *> phis;
4773   bool slp_reduc = false;
4774   bool direct_slp_reduc;
4775   tree new_phi_result;
4776   gimple *inner_phi = NULL;
4777   tree induction_index = NULL_TREE;
4778
4779   if (slp_node)
4780     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4781
4782   if (nested_in_vect_loop_p (loop, stmt))
4783     {
4784       outer_loop = loop;
4785       loop = loop->inner;
4786       nested_in_vect_loop = true;
4787       gcc_assert (!slp_node);
4788     }
4789
4790   vectype = STMT_VINFO_VECTYPE (stmt_info);
4791   gcc_assert (vectype);
4792   mode = TYPE_MODE (vectype);
4793
4794   /* 1. Create the reduction def-use cycle:
4795      Set the arguments of REDUCTION_PHIS, i.e., transform
4796
4797         loop:
4798           vec_def = phi <null, null>            # REDUCTION_PHI
4799           VECT_DEF = vector_stmt                # vectorized form of STMT
4800           ...
4801
4802      into:
4803
4804         loop:
4805           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4806           VECT_DEF = vector_stmt                # vectorized form of STMT
4807           ...
4808
4809      (in case of SLP, do it for all the phis). */
4810
4811   /* Get the loop-entry arguments.  */
4812   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4813   if (slp_node)
4814     {
4815       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4816       vec_initial_defs.reserve (vec_num);
4817       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4818                                       &vec_initial_defs, vec_num,
4819                                       GROUP_FIRST_ELEMENT (stmt_info),
4820                                       neutral_op);
4821     }
4822   else
4823     {
4824       /* Get at the scalar def before the loop, that defines the initial value
4825          of the reduction variable.  */
4826       gimple *def_stmt;
4827       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4828                                            loop_preheader_edge (loop));
4829       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4830          and we can't use zero for induc_val, use initial_def.  Similarly
4831          for REDUC_MIN and initial_def larger than the base.  */
4832       if (TREE_CODE (initial_def) == INTEGER_CST
4833           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4834               == INTEGER_INDUC_COND_REDUCTION)
4835           && !integer_zerop (induc_val)
4836           && ((induc_code == MAX_EXPR
4837                && tree_int_cst_lt (initial_def, induc_val))
4838               || (induc_code == MIN_EXPR
4839                   && tree_int_cst_lt (induc_val, initial_def))))
4840         induc_val = initial_def;
4841       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4842       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4843                                                        &adjustment_def);
4844       vec_initial_defs.create (1);
4845       vec_initial_defs.quick_push (vec_initial_def);
4846     }
4847
4848   /* Set phi nodes arguments.  */
4849   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4850     {
4851       tree vec_init_def = vec_initial_defs[i];
4852       tree def = vect_defs[i];
4853       for (j = 0; j < ncopies; j++)
4854         {
4855           if (j != 0)
4856             {
4857               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4858               if (nested_in_vect_loop)
4859                 vec_init_def
4860                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4861                                                     vec_init_def);
4862             }
4863
4864           /* Set the loop-entry arg of the reduction-phi.  */
4865
4866           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4867               == INTEGER_INDUC_COND_REDUCTION)
4868             {
4869               /* Initialise the reduction phi to zero.  This prevents initial
4870                  values of non-zero interferring with the reduction op.  */
4871               gcc_assert (ncopies == 1);
4872               gcc_assert (i == 0);
4873
4874               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4875               tree induc_val_vec
4876                 = build_vector_from_val (vec_init_def_type, induc_val);
4877
4878               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4879                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4880             }
4881           else
4882             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4883                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4884
4885           /* Set the loop-latch arg for the reduction-phi.  */
4886           if (j > 0)
4887             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4888
4889           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4890                        UNKNOWN_LOCATION);
4891
4892           if (dump_enabled_p ())
4893             {
4894               dump_printf_loc (MSG_NOTE, vect_location,
4895                                "transform reduction: created def-use cycle: ");
4896               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4897               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4898             }
4899         }
4900     }
4901
4902   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4903      which is updated with the current index of the loop for every match of
4904      the original loop's cond_expr (VEC_STMT).  This results in a vector
4905      containing the last time the condition passed for that vector lane.
4906      The first match will be a 1 to allow 0 to be used for non-matching
4907      indexes.  If there are no matches at all then the vector will be all
4908      zeroes.  */
4909   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4910     {
4911       tree indx_before_incr, indx_after_incr;
4912       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4913
4914       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4915       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4916
4917       int scalar_precision
4918         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4919       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4920       tree cr_index_vector_type = build_vector_type
4921         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4922
4923       /* First we create a simple vector induction variable which starts
4924          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4925          vector size (STEP).  */
4926
4927       /* Create a {1,2,3,...} vector.  */
4928       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4929
4930       /* Create a vector of the step value.  */
4931       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4932       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4933
4934       /* Create an induction variable.  */
4935       gimple_stmt_iterator incr_gsi;
4936       bool insert_after;
4937       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4938       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4939                  insert_after, &indx_before_incr, &indx_after_incr);
4940
4941       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4942          filled with zeros (VEC_ZERO).  */
4943
4944       /* Create a vector of 0s.  */
4945       tree zero = build_zero_cst (cr_index_scalar_type);
4946       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4947
4948       /* Create a vector phi node.  */
4949       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4950       new_phi = create_phi_node (new_phi_tree, loop->header);
4951       set_vinfo_for_stmt (new_phi,
4952                           new_stmt_vec_info (new_phi, loop_vinfo));
4953       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4954                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4955
4956       /* Now take the condition from the loops original cond_expr
4957          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4958          every match uses values from the induction variable
4959          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4960          (NEW_PHI_TREE).
4961          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4962          the new cond_expr (INDEX_COND_EXPR).  */
4963
4964       /* Duplicate the condition from vec_stmt.  */
4965       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4966
4967       /* Create a conditional, where the condition is taken from vec_stmt
4968          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4969          else is the phi (NEW_PHI_TREE).  */
4970       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4971                                      ccompare, indx_before_incr,
4972                                      new_phi_tree);
4973       induction_index = make_ssa_name (cr_index_vector_type);
4974       gimple *index_condition = gimple_build_assign (induction_index,
4975                                                      index_cond_expr);
4976       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4977       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4978                                                         loop_vinfo);
4979       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4980       set_vinfo_for_stmt (index_condition, index_vec_info);
4981
4982       /* Update the phi with the vec cond.  */
4983       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4984                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4985     }
4986
4987   /* 2. Create epilog code.
4988         The reduction epilog code operates across the elements of the vector
4989         of partial results computed by the vectorized loop.
4990         The reduction epilog code consists of:
4991
4992         step 1: compute the scalar result in a vector (v_out2)
4993         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4994         step 3: adjust the scalar result (s_out3) if needed.
4995
4996         Step 1 can be accomplished using one the following three schemes:
4997           (scheme 1) using reduc_fn, if available.
4998           (scheme 2) using whole-vector shifts, if available.
4999           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5000                      combined.
5001
5002           The overall epilog code looks like this:
5003
5004           s_out0 = phi <s_loop>         # original EXIT_PHI
5005           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5006           v_out2 = reduce <v_out1>              # step 1
5007           s_out3 = extract_field <v_out2, 0>    # step 2
5008           s_out4 = adjust_result <s_out3>       # step 3
5009
5010           (step 3 is optional, and steps 1 and 2 may be combined).
5011           Lastly, the uses of s_out0 are replaced by s_out4.  */
5012
5013
5014   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5015          v_out1 = phi <VECT_DEF>
5016          Store them in NEW_PHIS.  */
5017
5018   exit_bb = single_exit (loop)->dest;
5019   prev_phi_info = NULL;
5020   new_phis.create (vect_defs.length ());
5021   FOR_EACH_VEC_ELT (vect_defs, i, def)
5022     {
5023       for (j = 0; j < ncopies; j++)
5024         {
5025           tree new_def = copy_ssa_name (def);
5026           phi = create_phi_node (new_def, exit_bb);
5027           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5028           if (j == 0)
5029             new_phis.quick_push (phi);
5030           else
5031             {
5032               def = vect_get_vec_def_for_stmt_copy (dt, def);
5033               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5034             }
5035
5036           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5037           prev_phi_info = vinfo_for_stmt (phi);
5038         }
5039     }
5040
5041   /* The epilogue is created for the outer-loop, i.e., for the loop being
5042      vectorized.  Create exit phis for the outer loop.  */
5043   if (double_reduc)
5044     {
5045       loop = outer_loop;
5046       exit_bb = single_exit (loop)->dest;
5047       inner_phis.create (vect_defs.length ());
5048       FOR_EACH_VEC_ELT (new_phis, i, phi)
5049         {
5050           tree new_result = copy_ssa_name (PHI_RESULT (phi));
5051           gphi *outer_phi = create_phi_node (new_result, exit_bb);
5052           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5053                            PHI_RESULT (phi));
5054           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5055                                                             loop_vinfo));
5056           inner_phis.quick_push (phi);
5057           new_phis[i] = outer_phi;
5058           prev_phi_info = vinfo_for_stmt (outer_phi);
5059           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5060             {
5061               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5062               new_result = copy_ssa_name (PHI_RESULT (phi));
5063               outer_phi = create_phi_node (new_result, exit_bb);
5064               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5065                                PHI_RESULT (phi));
5066               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5067                                                                 loop_vinfo));
5068               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5069               prev_phi_info = vinfo_for_stmt (outer_phi);
5070             }
5071         }
5072     }
5073
5074   exit_gsi = gsi_after_labels (exit_bb);
5075
5076   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5077          (i.e. when reduc_fn is not available) and in the final adjustment
5078          code (if needed).  Also get the original scalar reduction variable as
5079          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5080          represents a reduction pattern), the tree-code and scalar-def are
5081          taken from the original stmt that the pattern-stmt (STMT) replaces.
5082          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5083          are taken from STMT.  */
5084
5085   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5086   if (!orig_stmt)
5087     {
5088       /* Regular reduction  */
5089       orig_stmt = stmt;
5090     }
5091   else
5092     {
5093       /* Reduction pattern  */
5094       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5095       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5096       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5097     }
5098
5099   code = gimple_assign_rhs_code (orig_stmt);
5100   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5101      partial results are added and not subtracted.  */
5102   if (code == MINUS_EXPR)
5103     code = PLUS_EXPR;
5104
5105   scalar_dest = gimple_assign_lhs (orig_stmt);
5106   scalar_type = TREE_TYPE (scalar_dest);
5107   scalar_results.create (group_size);
5108   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5109   bitsize = TYPE_SIZE (scalar_type);
5110
5111   /* In case this is a reduction in an inner-loop while vectorizing an outer
5112      loop - we don't need to extract a single scalar result at the end of the
5113      inner-loop (unless it is double reduction, i.e., the use of reduction is
5114      outside the outer-loop).  The final vector of partial results will be used
5115      in the vectorized outer-loop, or reduced to a scalar result at the end of
5116      the outer-loop.  */
5117   if (nested_in_vect_loop && !double_reduc)
5118     goto vect_finalize_reduction;
5119
5120   /* SLP reduction without reduction chain, e.g.,
5121      # a1 = phi <a2, a0>
5122      # b1 = phi <b2, b0>
5123      a2 = operation (a1)
5124      b2 = operation (b1)  */
5125   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5126
5127   /* True if we should implement SLP_REDUC using native reduction operations
5128      instead of scalar operations.  */
5129   direct_slp_reduc = (reduc_fn != IFN_LAST
5130                       && slp_reduc
5131                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5132
5133   /* In case of reduction chain, e.g.,
5134      # a1 = phi <a3, a0>
5135      a2 = operation (a1)
5136      a3 = operation (a2),
5137
5138      we may end up with more than one vector result.  Here we reduce them to
5139      one vector.  */
5140   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5141     {
5142       tree first_vect = PHI_RESULT (new_phis[0]);
5143       gassign *new_vec_stmt = NULL;
5144       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5145       for (k = 1; k < new_phis.length (); k++)
5146         {
5147           gimple *next_phi = new_phis[k];
5148           tree second_vect = PHI_RESULT (next_phi);
5149           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5150           new_vec_stmt = gimple_build_assign (tem, code,
5151                                               first_vect, second_vect);
5152           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5153           first_vect = tem;
5154         }
5155
5156       new_phi_result = first_vect;
5157       if (new_vec_stmt)
5158         {
5159           new_phis.truncate (0);
5160           new_phis.safe_push (new_vec_stmt);
5161         }
5162     }
5163   /* Likewise if we couldn't use a single defuse cycle.  */
5164   else if (ncopies > 1)
5165     {
5166       gcc_assert (new_phis.length () == 1);
5167       tree first_vect = PHI_RESULT (new_phis[0]);
5168       gassign *new_vec_stmt = NULL;
5169       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5170       gimple *next_phi = new_phis[0];
5171       for (int k = 1; k < ncopies; ++k)
5172         {
5173           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5174           tree second_vect = PHI_RESULT (next_phi);
5175           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5176           new_vec_stmt = gimple_build_assign (tem, code,
5177                                               first_vect, second_vect);
5178           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5179           first_vect = tem;
5180         }
5181       new_phi_result = first_vect;
5182       new_phis.truncate (0);
5183       new_phis.safe_push (new_vec_stmt);
5184     }
5185   else
5186     new_phi_result = PHI_RESULT (new_phis[0]);
5187
5188   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5189       && reduc_fn != IFN_LAST)
5190     {
5191       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5192          various data values where the condition matched and another vector
5193          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5194          need to extract the last matching index (which will be the index with
5195          highest value) and use this to index into the data vector.
5196          For the case where there were no matches, the data vector will contain
5197          all default values and the index vector will be all zeros.  */
5198
5199       /* Get various versions of the type of the vector of indexes.  */
5200       tree index_vec_type = TREE_TYPE (induction_index);
5201       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5202       tree index_scalar_type = TREE_TYPE (index_vec_type);
5203       tree index_vec_cmp_type = build_same_sized_truth_vector_type
5204         (index_vec_type);
5205
5206       /* Get an unsigned integer version of the type of the data vector.  */
5207       int scalar_precision
5208         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5209       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5210       tree vectype_unsigned = build_vector_type
5211         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5212
5213       /* First we need to create a vector (ZERO_VEC) of zeros and another
5214          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5215          can create using a MAX reduction and then expanding.
5216          In the case where the loop never made any matches, the max index will
5217          be zero.  */
5218
5219       /* Vector of {0, 0, 0,...}.  */
5220       tree zero_vec = make_ssa_name (vectype);
5221       tree zero_vec_rhs = build_zero_cst (vectype);
5222       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5223       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5224
5225       /* Find maximum value from the vector of found indexes.  */
5226       tree max_index = make_ssa_name (index_scalar_type);
5227       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5228                                                           1, induction_index);
5229       gimple_call_set_lhs (max_index_stmt, max_index);
5230       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5231
5232       /* Vector of {max_index, max_index, max_index,...}.  */
5233       tree max_index_vec = make_ssa_name (index_vec_type);
5234       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5235                                                       max_index);
5236       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5237                                                         max_index_vec_rhs);
5238       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5239
5240       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5241          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5242          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5243          otherwise.  Only one value should match, resulting in a vector
5244          (VEC_COND) with one data value and the rest zeros.
5245          In the case where the loop never made any matches, every index will
5246          match, resulting in a vector with all data values (which will all be
5247          the default value).  */
5248
5249       /* Compare the max index vector to the vector of found indexes to find
5250          the position of the max value.  */
5251       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5252       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5253                                                       induction_index,
5254                                                       max_index_vec);
5255       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5256
5257       /* Use the compare to choose either values from the data vector or
5258          zero.  */
5259       tree vec_cond = make_ssa_name (vectype);
5260       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5261                                                    vec_compare, new_phi_result,
5262                                                    zero_vec);
5263       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5264
5265       /* Finally we need to extract the data value from the vector (VEC_COND)
5266          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5267          reduction, but because this doesn't exist, we can use a MAX reduction
5268          instead.  The data value might be signed or a float so we need to cast
5269          it first.
5270          In the case where the loop never made any matches, the data values are
5271          all identical, and so will reduce down correctly.  */
5272
5273       /* Make the matched data values unsigned.  */
5274       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5275       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5276                                        vec_cond);
5277       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5278                                                         VIEW_CONVERT_EXPR,
5279                                                         vec_cond_cast_rhs);
5280       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5281
5282       /* Reduce down to a scalar value.  */
5283       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5284       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5285                                                            1, vec_cond_cast);
5286       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5287       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5288
5289       /* Convert the reduced value back to the result type and set as the
5290          result.  */
5291       gimple_seq stmts = NULL;
5292       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5293                                data_reduc);
5294       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5295       scalar_results.safe_push (new_temp);
5296     }
5297   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5298            && reduc_fn == IFN_LAST)
5299     {
5300       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5301          idx = 0;
5302          idx_val = induction_index[0];
5303          val = data_reduc[0];
5304          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5305            if (induction_index[i] > idx_val)
5306              val = data_reduc[i], idx_val = induction_index[i];
5307          return val;  */
5308
5309       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5310       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5311       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5312       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5313       /* Enforced by vectorizable_reduction, which ensures we have target
5314          support before allowing a conditional reduction on variable-length
5315          vectors.  */
5316       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5317       tree idx_val = NULL_TREE, val = NULL_TREE;
5318       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5319         {
5320           tree old_idx_val = idx_val;
5321           tree old_val = val;
5322           idx_val = make_ssa_name (idx_eltype);
5323           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5324                                              build3 (BIT_FIELD_REF, idx_eltype,
5325                                                      induction_index,
5326                                                      bitsize_int (el_size),
5327                                                      bitsize_int (off)));
5328           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5329           val = make_ssa_name (data_eltype);
5330           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5331                                              build3 (BIT_FIELD_REF,
5332                                                      data_eltype,
5333                                                      new_phi_result,
5334                                                      bitsize_int (el_size),
5335                                                      bitsize_int (off)));
5336           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5337           if (off != 0)
5338             {
5339               tree new_idx_val = idx_val;
5340               tree new_val = val;
5341               if (off != v_size - el_size)
5342                 {
5343                   new_idx_val = make_ssa_name (idx_eltype);
5344                   epilog_stmt = gimple_build_assign (new_idx_val,
5345                                                      MAX_EXPR, idx_val,
5346                                                      old_idx_val);
5347                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5348                 }
5349               new_val = make_ssa_name (data_eltype);
5350               epilog_stmt = gimple_build_assign (new_val,
5351                                                  COND_EXPR,
5352                                                  build2 (GT_EXPR,
5353                                                          boolean_type_node,
5354                                                          idx_val,
5355                                                          old_idx_val),
5356                                                  val, old_val);
5357               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5358               idx_val = new_idx_val;
5359               val = new_val;
5360             }
5361         }
5362       /* Convert the reduced value back to the result type and set as the
5363          result.  */
5364       gimple_seq stmts = NULL;
5365       val = gimple_convert (&stmts, scalar_type, val);
5366       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5367       scalar_results.safe_push (val);
5368     }
5369
5370   /* 2.3 Create the reduction code, using one of the three schemes described
5371          above. In SLP we simply need to extract all the elements from the
5372          vector (without reducing them), so we use scalar shifts.  */
5373   else if (reduc_fn != IFN_LAST && !slp_reduc)
5374     {
5375       tree tmp;
5376       tree vec_elem_type;
5377
5378       /* Case 1:  Create:
5379          v_out2 = reduc_expr <v_out1>  */
5380
5381       if (dump_enabled_p ())
5382         dump_printf_loc (MSG_NOTE, vect_location,
5383                          "Reduce using direct vector reduction.\n");
5384
5385       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5386       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5387         {
5388           tree tmp_dest
5389             = vect_create_destination_var (scalar_dest, vec_elem_type);
5390           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5391                                                     new_phi_result);
5392           gimple_set_lhs (epilog_stmt, tmp_dest);
5393           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5394           gimple_set_lhs (epilog_stmt, new_temp);
5395           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5396
5397           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5398                                              new_temp);
5399         }
5400       else
5401         {
5402           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5403                                                     new_phi_result);
5404           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5405         }
5406
5407       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5408       gimple_set_lhs (epilog_stmt, new_temp);
5409       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5410
5411       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5412            == INTEGER_INDUC_COND_REDUCTION)
5413           && !operand_equal_p (initial_def, induc_val, 0))
5414         {
5415           /* Earlier we set the initial value to be a vector if induc_val
5416              values.  Check the result and if it is induc_val then replace
5417              with the original initial value, unless induc_val is
5418              the same as initial_def already.  */
5419           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5420                                   induc_val);
5421
5422           tmp = make_ssa_name (new_scalar_dest);
5423           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5424                                              initial_def, new_temp);
5425           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5426           new_temp = tmp;
5427         }
5428
5429       scalar_results.safe_push (new_temp);
5430     }
5431   else if (direct_slp_reduc)
5432     {
5433       /* Here we create one vector for each of the GROUP_SIZE results,
5434          with the elements for other SLP statements replaced with the
5435          neutral value.  We can then do a normal reduction on each vector.  */
5436
5437       /* Enforced by vectorizable_reduction.  */
5438       gcc_assert (new_phis.length () == 1);
5439       gcc_assert (pow2p_hwi (group_size));
5440
5441       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5442       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5443       gimple_seq seq = NULL;
5444
5445       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5446          and the same element size as VECTYPE.  */
5447       tree index = build_index_vector (vectype, 0, 1);
5448       tree index_type = TREE_TYPE (index);
5449       tree index_elt_type = TREE_TYPE (index_type);
5450       tree mask_type = build_same_sized_truth_vector_type (index_type);
5451
5452       /* Create a vector that, for each element, identifies which of
5453          the GROUP_SIZE results should use it.  */
5454       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5455       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5456                             build_vector_from_val (index_type, index_mask));
5457
5458       /* Get a neutral vector value.  This is simply a splat of the neutral
5459          scalar value if we have one, otherwise the initial scalar value
5460          is itself a neutral value.  */
5461       tree vector_identity = NULL_TREE;
5462       if (neutral_op)
5463         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5464                                                         neutral_op);
5465       for (unsigned int i = 0; i < group_size; ++i)
5466         {
5467           /* If there's no univeral neutral value, we can use the
5468              initial scalar value from the original PHI.  This is used
5469              for MIN and MAX reduction, for example.  */
5470           if (!neutral_op)
5471             {
5472               tree scalar_value
5473                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5474                                          loop_preheader_edge (loop));
5475               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5476                                                               scalar_value);
5477             }
5478
5479           /* Calculate the equivalent of:
5480
5481              sel[j] = (index[j] == i);
5482
5483              which selects the elements of NEW_PHI_RESULT that should
5484              be included in the result.  */
5485           tree compare_val = build_int_cst (index_elt_type, i);
5486           compare_val = build_vector_from_val (index_type, compare_val);
5487           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5488                                    index, compare_val);
5489
5490           /* Calculate the equivalent of:
5491
5492              vec = seq ? new_phi_result : vector_identity;
5493
5494              VEC is now suitable for a full vector reduction.  */
5495           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5496                                    sel, new_phi_result, vector_identity);
5497
5498           /* Do the reduction and convert it to the appropriate type.  */
5499           gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5500           tree scalar = make_ssa_name (TREE_TYPE (vectype));
5501           gimple_call_set_lhs (call, scalar);
5502           gimple_seq_add_stmt (&seq, call);
5503           scalar = gimple_convert (&seq, scalar_type, scalar);
5504           scalar_results.safe_push (scalar);
5505         }
5506       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5507     }
5508   else
5509     {
5510       bool reduce_with_shift;
5511       tree vec_temp;
5512
5513       /* COND reductions all do the final reduction with MAX_EXPR
5514          or MIN_EXPR.  */
5515       if (code == COND_EXPR)
5516         {
5517           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5518               == INTEGER_INDUC_COND_REDUCTION)
5519             code = induc_code;
5520           else
5521             code = MAX_EXPR;
5522         }
5523
5524       /* See if the target wants to do the final (shift) reduction
5525          in a vector mode of smaller size and first reduce upper/lower
5526          halves against each other.  */
5527       enum machine_mode mode1 = mode;
5528       tree vectype1 = vectype;
5529       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5530       unsigned sz1 = sz;
5531       if (!slp_reduc
5532           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5533         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5534
5535       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5536       reduce_with_shift = have_whole_vector_shift (mode1);
5537       if (!VECTOR_MODE_P (mode1))
5538         reduce_with_shift = false;
5539       else
5540         {
5541           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5542           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5543             reduce_with_shift = false;
5544         }
5545
5546       /* First reduce the vector to the desired vector size we should
5547          do shift reduction on by combining upper and lower halves.  */
5548       new_temp = new_phi_result;
5549       while (sz > sz1)
5550         {
5551           gcc_assert (!slp_reduc);
5552           sz /= 2;
5553           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5554
5555           /* The target has to make sure we support lowpart/highpart
5556              extraction, either via direct vector extract or through
5557              an integer mode punning.  */
5558           tree dst1, dst2;
5559           if (convert_optab_handler (vec_extract_optab,
5560                                      TYPE_MODE (TREE_TYPE (new_temp)),
5561                                      TYPE_MODE (vectype1))
5562               != CODE_FOR_nothing)
5563             {
5564               /* Extract sub-vectors directly once vec_extract becomes
5565                  a conversion optab.  */
5566               dst1 = make_ssa_name (vectype1);
5567               epilog_stmt
5568                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5569                                          build3 (BIT_FIELD_REF, vectype1,
5570                                                  new_temp, TYPE_SIZE (vectype1),
5571                                                  bitsize_int (0)));
5572               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5573               dst2 =  make_ssa_name (vectype1);
5574               epilog_stmt
5575                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5576                                          build3 (BIT_FIELD_REF, vectype1,
5577                                                  new_temp, TYPE_SIZE (vectype1),
5578                                                  bitsize_int (sz * BITS_PER_UNIT)));
5579               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5580             }
5581           else
5582             {
5583               /* Extract via punning to appropriately sized integer mode
5584                  vector.  */
5585               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5586                                                             1);
5587               tree etype = build_vector_type (eltype, 2);
5588               gcc_assert (convert_optab_handler (vec_extract_optab,
5589                                                  TYPE_MODE (etype),
5590                                                  TYPE_MODE (eltype))
5591                           != CODE_FOR_nothing);
5592               tree tem = make_ssa_name (etype);
5593               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5594                                                  build1 (VIEW_CONVERT_EXPR,
5595                                                          etype, new_temp));
5596               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5597               new_temp = tem;
5598               tem = make_ssa_name (eltype);
5599               epilog_stmt
5600                   = gimple_build_assign (tem, BIT_FIELD_REF,
5601                                          build3 (BIT_FIELD_REF, eltype,
5602                                                  new_temp, TYPE_SIZE (eltype),
5603                                                  bitsize_int (0)));
5604               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5605               dst1 = make_ssa_name (vectype1);
5606               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5607                                                  build1 (VIEW_CONVERT_EXPR,
5608                                                          vectype1, tem));
5609               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5610               tem = make_ssa_name (eltype);
5611               epilog_stmt
5612                   = gimple_build_assign (tem, BIT_FIELD_REF,
5613                                          build3 (BIT_FIELD_REF, eltype,
5614                                                  new_temp, TYPE_SIZE (eltype),
5615                                                  bitsize_int (sz * BITS_PER_UNIT)));
5616               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5617               dst2 =  make_ssa_name (vectype1);
5618               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5619                                                  build1 (VIEW_CONVERT_EXPR,
5620                                                          vectype1, tem));
5621               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5622             }
5623
5624           new_temp = make_ssa_name (vectype1);
5625           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5626           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5627         }
5628
5629       if (reduce_with_shift && !slp_reduc)
5630         {
5631           int element_bitsize = tree_to_uhwi (bitsize);
5632           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5633              for variable-length vectors and also requires direct target support
5634              for loop reductions.  */
5635           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5636           int nelements = vec_size_in_bits / element_bitsize;
5637           vec_perm_builder sel;
5638           vec_perm_indices indices;
5639
5640           int elt_offset;
5641
5642           tree zero_vec = build_zero_cst (vectype1);
5643           /* Case 2: Create:
5644              for (offset = nelements/2; offset >= 1; offset/=2)
5645                 {
5646                   Create:  va' = vec_shift <va, offset>
5647                   Create:  va = vop <va, va'>
5648                 }  */
5649
5650           tree rhs;
5651
5652           if (dump_enabled_p ())
5653             dump_printf_loc (MSG_NOTE, vect_location,
5654                              "Reduce using vector shifts\n");
5655
5656           mode1 = TYPE_MODE (vectype1);
5657           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5658           for (elt_offset = nelements / 2;
5659                elt_offset >= 1;
5660                elt_offset /= 2)
5661             {
5662               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5663               indices.new_vector (sel, 2, nelements);
5664               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5665               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5666                                                  new_temp, zero_vec, mask);
5667               new_name = make_ssa_name (vec_dest, epilog_stmt);
5668               gimple_assign_set_lhs (epilog_stmt, new_name);
5669               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5670
5671               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5672                                                  new_temp);
5673               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5674               gimple_assign_set_lhs (epilog_stmt, new_temp);
5675               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5676             }
5677
5678           /* 2.4  Extract the final scalar result.  Create:
5679              s_out3 = extract_field <v_out2, bitpos>  */
5680
5681           if (dump_enabled_p ())
5682             dump_printf_loc (MSG_NOTE, vect_location,
5683                              "extract scalar result\n");
5684
5685           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5686                         bitsize, bitsize_zero_node);
5687           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5688           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5689           gimple_assign_set_lhs (epilog_stmt, new_temp);
5690           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5691           scalar_results.safe_push (new_temp);
5692         }
5693       else
5694         {
5695           /* Case 3: Create:
5696              s = extract_field <v_out2, 0>
5697              for (offset = element_size;
5698                   offset < vector_size;
5699                   offset += element_size;)
5700                {
5701                  Create:  s' = extract_field <v_out2, offset>
5702                  Create:  s = op <s, s'>  // For non SLP cases
5703                }  */
5704
5705           if (dump_enabled_p ())
5706             dump_printf_loc (MSG_NOTE, vect_location,
5707                              "Reduce using scalar code.\n");
5708
5709           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5710           int element_bitsize = tree_to_uhwi (bitsize);
5711           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5712             {
5713               int bit_offset;
5714               if (gimple_code (new_phi) == GIMPLE_PHI)
5715                 vec_temp = PHI_RESULT (new_phi);
5716               else
5717                 vec_temp = gimple_assign_lhs (new_phi);
5718               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5719                                  bitsize_zero_node);
5720               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5721               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5722               gimple_assign_set_lhs (epilog_stmt, new_temp);
5723               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5724
5725               /* In SLP we don't need to apply reduction operation, so we just
5726                  collect s' values in SCALAR_RESULTS.  */
5727               if (slp_reduc)
5728                 scalar_results.safe_push (new_temp);
5729
5730               for (bit_offset = element_bitsize;
5731                    bit_offset < vec_size_in_bits;
5732                    bit_offset += element_bitsize)
5733                 {
5734                   tree bitpos = bitsize_int (bit_offset);
5735                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5736                                      bitsize, bitpos);
5737
5738                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5739                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5740                   gimple_assign_set_lhs (epilog_stmt, new_name);
5741                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5742
5743                   if (slp_reduc)
5744                     {
5745                       /* In SLP we don't need to apply reduction operation, so
5746                          we just collect s' values in SCALAR_RESULTS.  */
5747                       new_temp = new_name;
5748                       scalar_results.safe_push (new_name);
5749                     }
5750                   else
5751                     {
5752                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5753                                                          new_name, new_temp);
5754                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5755                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5756                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5757                     }
5758                 }
5759             }
5760
5761           /* The only case where we need to reduce scalar results in SLP, is
5762              unrolling.  If the size of SCALAR_RESULTS is greater than
5763              GROUP_SIZE, we reduce them combining elements modulo
5764              GROUP_SIZE.  */
5765           if (slp_reduc)
5766             {
5767               tree res, first_res, new_res;
5768               gimple *new_stmt;
5769
5770               /* Reduce multiple scalar results in case of SLP unrolling.  */
5771               for (j = group_size; scalar_results.iterate (j, &res);
5772                    j++)
5773                 {
5774                   first_res = scalar_results[j % group_size];
5775                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5776                                                   first_res, res);
5777                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5778                   gimple_assign_set_lhs (new_stmt, new_res);
5779                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5780                   scalar_results[j % group_size] = new_res;
5781                 }
5782             }
5783           else
5784             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5785             scalar_results.safe_push (new_temp);
5786         }
5787
5788       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5789            == INTEGER_INDUC_COND_REDUCTION)
5790           && !operand_equal_p (initial_def, induc_val, 0))
5791         {
5792           /* Earlier we set the initial value to be a vector if induc_val
5793              values.  Check the result and if it is induc_val then replace
5794              with the original initial value, unless induc_val is
5795              the same as initial_def already.  */
5796           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5797                                   induc_val);
5798
5799           tree tmp = make_ssa_name (new_scalar_dest);
5800           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5801                                              initial_def, new_temp);
5802           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5803           scalar_results[0] = tmp;
5804         }
5805     }
5806
5807 vect_finalize_reduction:
5808
5809   if (double_reduc)
5810     loop = loop->inner;
5811
5812   /* 2.5 Adjust the final result by the initial value of the reduction
5813          variable. (When such adjustment is not needed, then
5814          'adjustment_def' is zero).  For example, if code is PLUS we create:
5815          new_temp = loop_exit_def + adjustment_def  */
5816
5817   if (adjustment_def)
5818     {
5819       gcc_assert (!slp_reduc);
5820       if (nested_in_vect_loop)
5821         {
5822           new_phi = new_phis[0];
5823           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5824           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5825           new_dest = vect_create_destination_var (scalar_dest, vectype);
5826         }
5827       else
5828         {
5829           new_temp = scalar_results[0];
5830           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5831           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5832           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5833         }
5834
5835       epilog_stmt = gimple_build_assign (new_dest, expr);
5836       new_temp = make_ssa_name (new_dest, epilog_stmt);
5837       gimple_assign_set_lhs (epilog_stmt, new_temp);
5838       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5839       if (nested_in_vect_loop)
5840         {
5841           set_vinfo_for_stmt (epilog_stmt,
5842                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5843           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5844                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5845
5846           if (!double_reduc)
5847             scalar_results.quick_push (new_temp);
5848           else
5849             scalar_results[0] = new_temp;
5850         }
5851       else
5852         scalar_results[0] = new_temp;
5853
5854       new_phis[0] = epilog_stmt;
5855     }
5856
5857   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5858           phis with new adjusted scalar results, i.e., replace use <s_out0>
5859           with use <s_out4>.
5860
5861      Transform:
5862         loop_exit:
5863           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5864           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5865           v_out2 = reduce <v_out1>
5866           s_out3 = extract_field <v_out2, 0>
5867           s_out4 = adjust_result <s_out3>
5868           use <s_out0>
5869           use <s_out0>
5870
5871      into:
5872
5873         loop_exit:
5874           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5875           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5876           v_out2 = reduce <v_out1>
5877           s_out3 = extract_field <v_out2, 0>
5878           s_out4 = adjust_result <s_out3>
5879           use <s_out4>
5880           use <s_out4> */
5881
5882
5883   /* In SLP reduction chain we reduce vector results into one vector if
5884      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5885      the last stmt in the reduction chain, since we are looking for the loop
5886      exit phi node.  */
5887   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5888     {
5889       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5890       /* Handle reduction patterns.  */
5891       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5892         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5893
5894       scalar_dest = gimple_assign_lhs (dest_stmt);
5895       group_size = 1;
5896     }
5897
5898   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5899      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5900      need to match SCALAR_RESULTS with corresponding statements.  The first
5901      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5902      the first vector stmt, etc.
5903      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5904   if (group_size > new_phis.length ())
5905     {
5906       ratio = group_size / new_phis.length ();
5907       gcc_assert (!(group_size % new_phis.length ()));
5908     }
5909   else
5910     ratio = 1;
5911
5912   for (k = 0; k < group_size; k++)
5913     {
5914       if (k % ratio == 0)
5915         {
5916           epilog_stmt = new_phis[k / ratio];
5917           reduction_phi = reduction_phis[k / ratio];
5918           if (double_reduc)
5919             inner_phi = inner_phis[k / ratio];
5920         }
5921
5922       if (slp_reduc)
5923         {
5924           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5925
5926           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5927           /* SLP statements can't participate in patterns.  */
5928           gcc_assert (!orig_stmt);
5929           scalar_dest = gimple_assign_lhs (current_stmt);
5930         }
5931
5932       phis.create (3);
5933       /* Find the loop-closed-use at the loop exit of the original scalar
5934          result.  (The reduction result is expected to have two immediate uses -
5935          one at the latch block, and one at the loop exit).  */
5936       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5937         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5938             && !is_gimple_debug (USE_STMT (use_p)))
5939           phis.safe_push (USE_STMT (use_p));
5940
5941       /* While we expect to have found an exit_phi because of loop-closed-ssa
5942          form we can end up without one if the scalar cycle is dead.  */
5943
5944       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5945         {
5946           if (outer_loop)
5947             {
5948               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5949               gphi *vect_phi;
5950
5951               /* FORNOW. Currently not supporting the case that an inner-loop
5952                  reduction is not used in the outer-loop (but only outside the
5953                  outer-loop), unless it is double reduction.  */
5954               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5955                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5956                           || double_reduc);
5957
5958               if (double_reduc)
5959                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5960               else
5961                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5962               if (!double_reduc
5963                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5964                       != vect_double_reduction_def)
5965                 continue;
5966
5967               /* Handle double reduction:
5968
5969                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5970                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5971                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5972                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5973
5974                  At that point the regular reduction (stmt2 and stmt3) is
5975                  already vectorized, as well as the exit phi node, stmt4.
5976                  Here we vectorize the phi node of double reduction, stmt1, and
5977                  update all relevant statements.  */
5978
5979               /* Go through all the uses of s2 to find double reduction phi
5980                  node, i.e., stmt1 above.  */
5981               orig_name = PHI_RESULT (exit_phi);
5982               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5983                 {
5984                   stmt_vec_info use_stmt_vinfo;
5985                   stmt_vec_info new_phi_vinfo;
5986                   tree vect_phi_init, preheader_arg, vect_phi_res;
5987                   basic_block bb = gimple_bb (use_stmt);
5988                   gimple *use;
5989
5990                   /* Check that USE_STMT is really double reduction phi
5991                      node.  */
5992                   if (gimple_code (use_stmt) != GIMPLE_PHI
5993                       || gimple_phi_num_args (use_stmt) != 2
5994                       || bb->loop_father != outer_loop)
5995                     continue;
5996                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5997                   if (!use_stmt_vinfo
5998                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5999                           != vect_double_reduction_def)
6000                     continue;
6001
6002                   /* Create vector phi node for double reduction:
6003                      vs1 = phi <vs0, vs2>
6004                      vs1 was created previously in this function by a call to
6005                        vect_get_vec_def_for_operand and is stored in
6006                        vec_initial_def;
6007                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6008                      vs0 is created here.  */
6009
6010                   /* Create vector phi node.  */
6011                   vect_phi = create_phi_node (vec_initial_def, bb);
6012                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
6013                                     loop_vec_info_for_loop (outer_loop));
6014                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6015
6016                   /* Create vs0 - initial def of the double reduction phi.  */
6017                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6018                                              loop_preheader_edge (outer_loop));
6019                   vect_phi_init = get_initial_def_for_reduction
6020                     (stmt, preheader_arg, NULL);
6021
6022                   /* Update phi node arguments with vs0 and vs2.  */
6023                   add_phi_arg (vect_phi, vect_phi_init,
6024                                loop_preheader_edge (outer_loop),
6025                                UNKNOWN_LOCATION);
6026                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6027                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6028                   if (dump_enabled_p ())
6029                     {
6030                       dump_printf_loc (MSG_NOTE, vect_location,
6031                                        "created double reduction phi node: ");
6032                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6033                     }
6034
6035                   vect_phi_res = PHI_RESULT (vect_phi);
6036
6037                   /* Replace the use, i.e., set the correct vs1 in the regular
6038                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
6039                      loop is redundant.  */
6040                   use = reduction_phi;
6041                   for (j = 0; j < ncopies; j++)
6042                     {
6043                       edge pr_edge = loop_preheader_edge (loop);
6044                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6045                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6046                     }
6047                 }
6048             }
6049         }
6050
6051       phis.release ();
6052       if (nested_in_vect_loop)
6053         {
6054           if (double_reduc)
6055             loop = outer_loop;
6056           else
6057             continue;
6058         }
6059
6060       phis.create (3);
6061       /* Find the loop-closed-use at the loop exit of the original scalar
6062          result.  (The reduction result is expected to have two immediate uses,
6063          one at the latch block, and one at the loop exit).  For double
6064          reductions we are looking for exit phis of the outer loop.  */
6065       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6066         {
6067           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6068             {
6069               if (!is_gimple_debug (USE_STMT (use_p)))
6070                 phis.safe_push (USE_STMT (use_p));
6071             }
6072           else
6073             {
6074               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6075                 {
6076                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6077
6078                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6079                     {
6080                       if (!flow_bb_inside_loop_p (loop,
6081                                              gimple_bb (USE_STMT (phi_use_p)))
6082                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6083                         phis.safe_push (USE_STMT (phi_use_p));
6084                     }
6085                 }
6086             }
6087         }
6088
6089       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6090         {
6091           /* Replace the uses:  */
6092           orig_name = PHI_RESULT (exit_phi);
6093           scalar_result = scalar_results[k];
6094           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6095             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6096               SET_USE (use_p, scalar_result);
6097         }
6098
6099       phis.release ();
6100     }
6101 }
6102
6103 /* Return a vector of type VECTYPE that is equal to the vector select
6104    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6105    before GSI.  */
6106
6107 static tree
6108 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6109                      tree vec, tree identity)
6110 {
6111   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6112   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6113                                           mask, vec, identity);
6114   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6115   return cond;
6116 }
6117
6118 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6119    order, starting with LHS.  Insert the extraction statements before GSI and
6120    associate the new scalar SSA names with variable SCALAR_DEST.
6121    Return the SSA name for the result.  */
6122
6123 static tree
6124 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6125                        tree_code code, tree lhs, tree vector_rhs)
6126 {
6127   tree vectype = TREE_TYPE (vector_rhs);
6128   tree scalar_type = TREE_TYPE (vectype);
6129   tree bitsize = TYPE_SIZE (scalar_type);
6130   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6131   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6132
6133   for (unsigned HOST_WIDE_INT bit_offset = 0;
6134        bit_offset < vec_size_in_bits;
6135        bit_offset += element_bitsize)
6136     {
6137       tree bitpos = bitsize_int (bit_offset);
6138       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6139                          bitsize, bitpos);
6140
6141       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6142       rhs = make_ssa_name (scalar_dest, stmt);
6143       gimple_assign_set_lhs (stmt, rhs);
6144       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6145
6146       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6147       tree new_name = make_ssa_name (scalar_dest, stmt);
6148       gimple_assign_set_lhs (stmt, new_name);
6149       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6150       lhs = new_name;
6151     }
6152   return lhs;
6153 }
6154
6155 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
6156    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6157    statement.  CODE is the operation performed by STMT and OPS are
6158    its scalar operands.  REDUC_INDEX is the index of the operand in
6159    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6160    implements in-order reduction, or IFN_LAST if we should open-code it.
6161    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6162    that should be used to control the operation in a fully-masked loop.  */
6163
6164 static bool
6165 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6166                                gimple **vec_stmt, slp_tree slp_node,
6167                                gimple *reduc_def_stmt,
6168                                tree_code code, internal_fn reduc_fn,
6169                                tree ops[3], tree vectype_in,
6170                                int reduc_index, vec_loop_masks *masks)
6171 {
6172   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6173   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6174   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6175   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6176   gimple *new_stmt = NULL;
6177
6178   int ncopies;
6179   if (slp_node)
6180     ncopies = 1;
6181   else
6182     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6183
6184   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6185   gcc_assert (ncopies == 1);
6186   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6187   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6188   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6189               == FOLD_LEFT_REDUCTION);
6190
6191   if (slp_node)
6192     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6193                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6194
6195   tree op0 = ops[1 - reduc_index];
6196
6197   int group_size = 1;
6198   gimple *scalar_dest_def;
6199   auto_vec<tree> vec_oprnds0;
6200   if (slp_node)
6201     {
6202       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6203       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6204       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6205     }
6206   else
6207     {
6208       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6209       vec_oprnds0.create (1);
6210       vec_oprnds0.quick_push (loop_vec_def0);
6211       scalar_dest_def = stmt;
6212     }
6213
6214   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6215   tree scalar_type = TREE_TYPE (scalar_dest);
6216   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6217
6218   int vec_num = vec_oprnds0.length ();
6219   gcc_assert (vec_num == 1 || slp_node);
6220   tree vec_elem_type = TREE_TYPE (vectype_out);
6221   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6222
6223   tree vector_identity = NULL_TREE;
6224   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6225     vector_identity = build_zero_cst (vectype_out);
6226
6227   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6228   int i;
6229   tree def0;
6230   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6231     {
6232       tree mask = NULL_TREE;
6233       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6234         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6235
6236       /* Handle MINUS by adding the negative.  */
6237       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6238         {
6239           tree negated = make_ssa_name (vectype_out);
6240           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6241           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6242           def0 = negated;
6243         }
6244
6245       if (mask)
6246         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6247                                     vector_identity);
6248
6249       /* On the first iteration the input is simply the scalar phi
6250          result, and for subsequent iterations it is the output of
6251          the preceding operation.  */
6252       if (reduc_fn != IFN_LAST)
6253         {
6254           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6255           /* For chained SLP reductions the output of the previous reduction
6256              operation serves as the input of the next. For the final statement
6257              the output cannot be a temporary - we reuse the original
6258              scalar destination of the last statement.  */
6259           if (i != vec_num - 1)
6260             {
6261               gimple_set_lhs (new_stmt, scalar_dest_var);
6262               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6263               gimple_set_lhs (new_stmt, reduc_var);
6264             }
6265         }
6266       else
6267         {
6268           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6269                                              reduc_var, def0);
6270           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6271           /* Remove the statement, so that we can use the same code paths
6272              as for statements that we've just created.  */
6273           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6274           gsi_remove (&tmp_gsi, false);
6275         }
6276
6277       if (i == vec_num - 1)
6278         {
6279           gimple_set_lhs (new_stmt, scalar_dest);
6280           vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6281         }
6282       else
6283         vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6284
6285       if (slp_node)
6286         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6287     }
6288
6289   if (!slp_node)
6290     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6291
6292   return true;
6293 }
6294
6295 /* Function is_nonwrapping_integer_induction.
6296
6297    Check if STMT (which is part of loop LOOP) both increments and
6298    does not cause overflow.  */
6299
6300 static bool
6301 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6302 {
6303   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6304   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6305   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6306   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6307   widest_int ni, max_loop_value, lhs_max;
6308   bool overflow = false;
6309
6310   /* Make sure the loop is integer based.  */
6311   if (TREE_CODE (base) != INTEGER_CST
6312       || TREE_CODE (step) != INTEGER_CST)
6313     return false;
6314
6315   /* Check that the max size of the loop will not wrap.  */
6316
6317   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6318     return true;
6319
6320   if (! max_stmt_executions (loop, &ni))
6321     return false;
6322
6323   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6324                             &overflow);
6325   if (overflow)
6326     return false;
6327
6328   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6329                             TYPE_SIGN (lhs_type), &overflow);
6330   if (overflow)
6331     return false;
6332
6333   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6334           <= TYPE_PRECISION (lhs_type));
6335 }
6336
6337 /* Function vectorizable_reduction.
6338
6339    Check if STMT performs a reduction operation that can be vectorized.
6340    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6341    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6342    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6343
6344    This function also handles reduction idioms (patterns) that have been
6345    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6346    of this form:
6347      X = pattern_expr (arg0, arg1, ..., X)
6348    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
6349    sequence that had been detected and replaced by the pattern-stmt (STMT).
6350
6351    This function also handles reduction of condition expressions, for example:
6352      for (int i = 0; i < N; i++)
6353        if (a[i] < value)
6354          last = a[i];
6355    This is handled by vectorising the loop and creating an additional vector
6356    containing the loop indexes for which "a[i] < value" was true.  In the
6357    function epilogue this is reduced to a single max value and then used to
6358    index into the vector of results.
6359
6360    In some cases of reduction patterns, the type of the reduction variable X is
6361    different than the type of the other arguments of STMT.
6362    In such cases, the vectype that is used when transforming STMT into a vector
6363    stmt is different than the vectype that is used to determine the
6364    vectorization factor, because it consists of a different number of elements
6365    than the actual number of elements that are being operated upon in parallel.
6366
6367    For example, consider an accumulation of shorts into an int accumulator.
6368    On some targets it's possible to vectorize this pattern operating on 8
6369    shorts at a time (hence, the vectype for purposes of determining the
6370    vectorization factor should be V8HI); on the other hand, the vectype that
6371    is used to create the vector form is actually V4SI (the type of the result).
6372
6373    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6374    indicates what is the actual level of parallelism (V8HI in the example), so
6375    that the right vectorization factor would be derived.  This vectype
6376    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6377    be used to create the vectorized stmt.  The right vectype for the vectorized
6378    stmt is obtained from the type of the result X:
6379         get_vectype_for_scalar_type (TREE_TYPE (X))
6380
6381    This means that, contrary to "regular" reductions (or "regular" stmts in
6382    general), the following equation:
6383       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6384    does *NOT* necessarily hold for reduction patterns.  */
6385
6386 bool
6387 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6388                         gimple **vec_stmt, slp_tree slp_node,
6389                         slp_instance slp_node_instance)
6390 {
6391   tree vec_dest;
6392   tree scalar_dest;
6393   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6394   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6395   tree vectype_in = NULL_TREE;
6396   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6397   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6398   enum tree_code code, orig_code;
6399   internal_fn reduc_fn;
6400   machine_mode vec_mode;
6401   int op_type;
6402   optab optab;
6403   tree new_temp = NULL_TREE;
6404   gimple *def_stmt;
6405   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6406   gimple *cond_reduc_def_stmt = NULL;
6407   enum tree_code cond_reduc_op_code = ERROR_MARK;
6408   tree scalar_type;
6409   bool is_simple_use;
6410   gimple *orig_stmt;
6411   stmt_vec_info orig_stmt_info = NULL;
6412   int i;
6413   int ncopies;
6414   int epilog_copies;
6415   stmt_vec_info prev_stmt_info, prev_phi_info;
6416   bool single_defuse_cycle = false;
6417   gimple *new_stmt = NULL;
6418   int j;
6419   tree ops[3];
6420   enum vect_def_type dts[3];
6421   bool nested_cycle = false, found_nested_cycle_def = false;
6422   bool double_reduc = false;
6423   basic_block def_bb;
6424   struct loop * def_stmt_loop, *outer_loop = NULL;
6425   tree def_arg;
6426   gimple *def_arg_stmt;
6427   auto_vec<tree> vec_oprnds0;
6428   auto_vec<tree> vec_oprnds1;
6429   auto_vec<tree> vec_oprnds2;
6430   auto_vec<tree> vect_defs;
6431   auto_vec<gimple *> phis;
6432   int vec_num;
6433   tree def0, tem;
6434   bool first_p = true;
6435   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6436   tree cond_reduc_val = NULL_TREE;
6437
6438   /* Make sure it was already recognized as a reduction computation.  */
6439   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6440       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6441     return false;
6442
6443   if (nested_in_vect_loop_p (loop, stmt))
6444     {
6445       outer_loop = loop;
6446       loop = loop->inner;
6447       nested_cycle = true;
6448     }
6449
6450   /* In case of reduction chain we switch to the first stmt in the chain, but
6451      we don't update STMT_INFO, since only the last stmt is marked as reduction
6452      and has reduction properties.  */
6453   if (GROUP_FIRST_ELEMENT (stmt_info)
6454       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6455     {
6456       stmt = GROUP_FIRST_ELEMENT (stmt_info);
6457       first_p = false;
6458     }
6459
6460   if (gimple_code (stmt) == GIMPLE_PHI)
6461     {
6462       /* Analysis is fully done on the reduction stmt invocation.  */
6463       if (! vec_stmt)
6464         {
6465           if (slp_node)
6466             slp_node_instance->reduc_phis = slp_node;
6467
6468           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6469           return true;
6470         }
6471
6472       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6473         /* Leave the scalar phi in place.  Note that checking
6474            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6475            for reductions involving a single statement.  */
6476         return true;
6477
6478       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6479       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6480         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6481
6482       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6483           == EXTRACT_LAST_REDUCTION)
6484         /* Leave the scalar phi in place.  */
6485         return true;
6486
6487       gcc_assert (is_gimple_assign (reduc_stmt));
6488       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6489         {
6490           tree op = gimple_op (reduc_stmt, k);
6491           if (op == gimple_phi_result (stmt))
6492             continue;
6493           if (k == 1
6494               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6495             continue;
6496           if (!vectype_in
6497               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6498                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6499             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6500           break;
6501         }
6502       gcc_assert (vectype_in);
6503
6504       if (slp_node)
6505         ncopies = 1;
6506       else
6507         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6508
6509       use_operand_p use_p;
6510       gimple *use_stmt;
6511       if (ncopies > 1
6512           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6513               <= vect_used_only_live)
6514           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6515           && (use_stmt == reduc_stmt
6516               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6517                   == reduc_stmt)))
6518         single_defuse_cycle = true;
6519
6520       /* Create the destination vector  */
6521       scalar_dest = gimple_assign_lhs (reduc_stmt);
6522       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6523
6524       if (slp_node)
6525         /* The size vect_schedule_slp_instance computes is off for us.  */
6526         vec_num = vect_get_num_vectors
6527           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6528            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6529            vectype_in);
6530       else
6531         vec_num = 1;
6532
6533       /* Generate the reduction PHIs upfront.  */
6534       prev_phi_info = NULL;
6535       for (j = 0; j < ncopies; j++)
6536         {
6537           if (j == 0 || !single_defuse_cycle)
6538             {
6539               for (i = 0; i < vec_num; i++)
6540                 {
6541                   /* Create the reduction-phi that defines the reduction
6542                      operand.  */
6543                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6544                   set_vinfo_for_stmt (new_phi,
6545                                       new_stmt_vec_info (new_phi, loop_vinfo));
6546
6547                   if (slp_node)
6548                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6549                   else
6550                     {
6551                       if (j == 0)
6552                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6553                       else
6554                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6555                       prev_phi_info = vinfo_for_stmt (new_phi);
6556                     }
6557                 }
6558             }
6559         }
6560
6561       return true;
6562     }
6563
6564   /* 1. Is vectorizable reduction?  */
6565   /* Not supportable if the reduction variable is used in the loop, unless
6566      it's a reduction chain.  */
6567   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6568       && !GROUP_FIRST_ELEMENT (stmt_info))
6569     return false;
6570
6571   /* Reductions that are not used even in an enclosing outer-loop,
6572      are expected to be "live" (used out of the loop).  */
6573   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6574       && !STMT_VINFO_LIVE_P (stmt_info))
6575     return false;
6576
6577   /* 2. Has this been recognized as a reduction pattern?
6578
6579      Check if STMT represents a pattern that has been recognized
6580      in earlier analysis stages.  For stmts that represent a pattern,
6581      the STMT_VINFO_RELATED_STMT field records the last stmt in
6582      the original sequence that constitutes the pattern.  */
6583
6584   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6585   if (orig_stmt)
6586     {
6587       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6588       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6589       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6590     }
6591
6592   /* 3. Check the operands of the operation.  The first operands are defined
6593         inside the loop body. The last operand is the reduction variable,
6594         which is defined by the loop-header-phi.  */
6595
6596   gcc_assert (is_gimple_assign (stmt));
6597
6598   /* Flatten RHS.  */
6599   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6600     {
6601     case GIMPLE_BINARY_RHS:
6602       code = gimple_assign_rhs_code (stmt);
6603       op_type = TREE_CODE_LENGTH (code);
6604       gcc_assert (op_type == binary_op);
6605       ops[0] = gimple_assign_rhs1 (stmt);
6606       ops[1] = gimple_assign_rhs2 (stmt);
6607       break;
6608
6609     case GIMPLE_TERNARY_RHS:
6610       code = gimple_assign_rhs_code (stmt);
6611       op_type = TREE_CODE_LENGTH (code);
6612       gcc_assert (op_type == ternary_op);
6613       ops[0] = gimple_assign_rhs1 (stmt);
6614       ops[1] = gimple_assign_rhs2 (stmt);
6615       ops[2] = gimple_assign_rhs3 (stmt);
6616       break;
6617
6618     case GIMPLE_UNARY_RHS:
6619       return false;
6620
6621     default:
6622       gcc_unreachable ();
6623     }
6624
6625   if (code == COND_EXPR && slp_node)
6626     return false;
6627
6628   scalar_dest = gimple_assign_lhs (stmt);
6629   scalar_type = TREE_TYPE (scalar_dest);
6630   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6631       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6632     return false;
6633
6634   /* Do not try to vectorize bit-precision reductions.  */
6635   if (!type_has_mode_precision_p (scalar_type))
6636     return false;
6637
6638   /* All uses but the last are expected to be defined in the loop.
6639      The last use is the reduction variable.  In case of nested cycle this
6640      assumption is not true: we use reduc_index to record the index of the
6641      reduction variable.  */
6642   gimple *reduc_def_stmt = NULL;
6643   int reduc_index = -1;
6644   for (i = 0; i < op_type; i++)
6645     {
6646       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6647       if (i == 0 && code == COND_EXPR)
6648         continue;
6649
6650       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6651                                           &def_stmt, &dts[i], &tem);
6652       dt = dts[i];
6653       gcc_assert (is_simple_use);
6654       if (dt == vect_reduction_def)
6655         {
6656           reduc_def_stmt = def_stmt;
6657           reduc_index = i;
6658           continue;
6659         }
6660       else if (tem)
6661         {
6662           /* To properly compute ncopies we are interested in the widest
6663              input type in case we're looking at a widening accumulation.  */
6664           if (!vectype_in
6665               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6666                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6667             vectype_in = tem;
6668         }
6669
6670       if (dt != vect_internal_def
6671           && dt != vect_external_def
6672           && dt != vect_constant_def
6673           && dt != vect_induction_def
6674           && !(dt == vect_nested_cycle && nested_cycle))
6675         return false;
6676
6677       if (dt == vect_nested_cycle)
6678         {
6679           found_nested_cycle_def = true;
6680           reduc_def_stmt = def_stmt;
6681           reduc_index = i;
6682         }
6683
6684       if (i == 1 && code == COND_EXPR)
6685         {
6686           /* Record how value of COND_EXPR is defined.  */
6687           if (dt == vect_constant_def)
6688             {
6689               cond_reduc_dt = dt;
6690               cond_reduc_val = ops[i];
6691             }
6692           if (dt == vect_induction_def
6693               && def_stmt != NULL
6694               && is_nonwrapping_integer_induction (def_stmt, loop))
6695             {
6696               cond_reduc_dt = dt;
6697               cond_reduc_def_stmt = def_stmt;
6698             }
6699         }
6700     }
6701
6702   if (!vectype_in)
6703     vectype_in = vectype_out;
6704
6705   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6706      directy used in stmt.  */
6707   if (reduc_index == -1)
6708     {
6709       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6710         {
6711           if (dump_enabled_p ())
6712             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6713                              "in-order reduction chain without SLP.\n");
6714           return false;
6715         }
6716
6717       if (orig_stmt)
6718         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6719       else
6720         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6721     }
6722
6723   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6724     return false;
6725
6726   if (!(reduc_index == -1
6727         || dts[reduc_index] == vect_reduction_def
6728         || dts[reduc_index] == vect_nested_cycle
6729         || ((dts[reduc_index] == vect_internal_def
6730              || dts[reduc_index] == vect_external_def
6731              || dts[reduc_index] == vect_constant_def
6732              || dts[reduc_index] == vect_induction_def)
6733             && nested_cycle && found_nested_cycle_def)))
6734     {
6735       /* For pattern recognized stmts, orig_stmt might be a reduction,
6736          but some helper statements for the pattern might not, or
6737          might be COND_EXPRs with reduction uses in the condition.  */
6738       gcc_assert (orig_stmt);
6739       return false;
6740     }
6741
6742   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6743   enum vect_reduction_type v_reduc_type
6744     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6745   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6746
6747   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6748   /* If we have a condition reduction, see if we can simplify it further.  */
6749   if (v_reduc_type == COND_REDUCTION)
6750     {
6751       /* Loop peeling modifies initial value of reduction PHI, which
6752          makes the reduction stmt to be transformed different to the
6753          original stmt analyzed.  We need to record reduction code for
6754          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6755          it can be used directly at transform stage.  */
6756       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6757           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6758         {
6759           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6760           gcc_assert (cond_reduc_dt == vect_constant_def);
6761           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6762         }
6763       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6764                                                vectype_in, OPTIMIZE_FOR_SPEED))
6765         {
6766           if (dump_enabled_p ())
6767             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6768                              "optimizing condition reduction with"
6769                              " FOLD_EXTRACT_LAST.\n");
6770           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6771         }
6772       else if (cond_reduc_dt == vect_induction_def)
6773         {
6774           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6775           tree base
6776             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6777           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6778
6779           gcc_assert (TREE_CODE (base) == INTEGER_CST
6780                       && TREE_CODE (step) == INTEGER_CST);
6781           cond_reduc_val = NULL_TREE;
6782           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6783              above base; punt if base is the minimum value of the type for
6784              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6785           if (tree_int_cst_sgn (step) == -1)
6786             {
6787               cond_reduc_op_code = MIN_EXPR;
6788               if (tree_int_cst_sgn (base) == -1)
6789                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6790               else if (tree_int_cst_lt (base,
6791                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6792                 cond_reduc_val
6793                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6794             }
6795           else
6796             {
6797               cond_reduc_op_code = MAX_EXPR;
6798               if (tree_int_cst_sgn (base) == 1)
6799                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6800               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6801                                         base))
6802                 cond_reduc_val
6803                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6804             }
6805           if (cond_reduc_val)
6806             {
6807               if (dump_enabled_p ())
6808                 dump_printf_loc (MSG_NOTE, vect_location,
6809                                  "condition expression based on "
6810                                  "integer induction.\n");
6811               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6812                 = INTEGER_INDUC_COND_REDUCTION;
6813             }
6814         }
6815       else if (cond_reduc_dt == vect_constant_def)
6816         {
6817           enum vect_def_type cond_initial_dt;
6818           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6819           tree cond_initial_val
6820             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6821
6822           gcc_assert (cond_reduc_val != NULL_TREE);
6823           vect_is_simple_use (cond_initial_val, loop_vinfo,
6824                               &def_stmt, &cond_initial_dt);
6825           if (cond_initial_dt == vect_constant_def
6826               && types_compatible_p (TREE_TYPE (cond_initial_val),
6827                                      TREE_TYPE (cond_reduc_val)))
6828             {
6829               tree e = fold_binary (LE_EXPR, boolean_type_node,
6830                                     cond_initial_val, cond_reduc_val);
6831               if (e && (integer_onep (e) || integer_zerop (e)))
6832                 {
6833                   if (dump_enabled_p ())
6834                     dump_printf_loc (MSG_NOTE, vect_location,
6835                                      "condition expression based on "
6836                                      "compile time constant.\n");
6837                   /* Record reduction code at analysis stage.  */
6838                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6839                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6840                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6841                     = CONST_COND_REDUCTION;
6842                 }
6843             }
6844         }
6845     }
6846
6847   if (orig_stmt)
6848     gcc_assert (tmp == orig_stmt
6849                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6850   else
6851     /* We changed STMT to be the first stmt in reduction chain, hence we
6852        check that in this case the first element in the chain is STMT.  */
6853     gcc_assert (stmt == tmp
6854                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6855
6856   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6857     return false;
6858
6859   if (slp_node)
6860     ncopies = 1;
6861   else
6862     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6863
6864   gcc_assert (ncopies >= 1);
6865
6866   vec_mode = TYPE_MODE (vectype_in);
6867   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6868
6869   if (code == COND_EXPR)
6870     {
6871       /* Only call during the analysis stage, otherwise we'll lose
6872          STMT_VINFO_TYPE.  */
6873       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6874                                                 ops[reduc_index], 0, NULL))
6875         {
6876           if (dump_enabled_p ())
6877             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6878                              "unsupported condition in reduction\n");
6879           return false;
6880         }
6881     }
6882   else
6883     {
6884       /* 4. Supportable by target?  */
6885
6886       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6887           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6888         {
6889           /* Shifts and rotates are only supported by vectorizable_shifts,
6890              not vectorizable_reduction.  */
6891           if (dump_enabled_p ())
6892             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6893                              "unsupported shift or rotation.\n");
6894           return false;
6895         }
6896
6897       /* 4.1. check support for the operation in the loop  */
6898       optab = optab_for_tree_code (code, vectype_in, optab_default);
6899       if (!optab)
6900         {
6901           if (dump_enabled_p ())
6902             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6903                              "no optab.\n");
6904
6905           return false;
6906         }
6907
6908       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6909         {
6910           if (dump_enabled_p ())
6911             dump_printf (MSG_NOTE, "op not supported by target.\n");
6912
6913           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6914               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6915             return false;
6916
6917           if (dump_enabled_p ())
6918             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6919         }
6920
6921       /* Worthwhile without SIMD support?  */
6922       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6923           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6924         {
6925           if (dump_enabled_p ())
6926             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6927                              "not worthwhile without SIMD support.\n");
6928
6929           return false;
6930         }
6931     }
6932
6933   /* 4.2. Check support for the epilog operation.
6934
6935           If STMT represents a reduction pattern, then the type of the
6936           reduction variable may be different than the type of the rest
6937           of the arguments.  For example, consider the case of accumulation
6938           of shorts into an int accumulator; The original code:
6939                         S1: int_a = (int) short_a;
6940           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6941
6942           was replaced with:
6943                         STMT: int_acc = widen_sum <short_a, int_acc>
6944
6945           This means that:
6946           1. The tree-code that is used to create the vector operation in the
6947              epilog code (that reduces the partial results) is not the
6948              tree-code of STMT, but is rather the tree-code of the original
6949              stmt from the pattern that STMT is replacing.  I.e, in the example
6950              above we want to use 'widen_sum' in the loop, but 'plus' in the
6951              epilog.
6952           2. The type (mode) we use to check available target support
6953              for the vector operation to be created in the *epilog*, is
6954              determined by the type of the reduction variable (in the example
6955              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6956              However the type (mode) we use to check available target support
6957              for the vector operation to be created *inside the loop*, is
6958              determined by the type of the other arguments to STMT (in the
6959              example we'd check this: optab_handler (widen_sum_optab,
6960              vect_short_mode)).
6961
6962           This is contrary to "regular" reductions, in which the types of all
6963           the arguments are the same as the type of the reduction variable.
6964           For "regular" reductions we can therefore use the same vector type
6965           (and also the same tree-code) when generating the epilog code and
6966           when generating the code inside the loop.  */
6967
6968   vect_reduction_type reduction_type
6969     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6970   if (orig_stmt
6971       && (reduction_type == TREE_CODE_REDUCTION
6972           || reduction_type == FOLD_LEFT_REDUCTION))
6973     {
6974       /* This is a reduction pattern: get the vectype from the type of the
6975          reduction variable, and get the tree-code from orig_stmt.  */
6976       orig_code = gimple_assign_rhs_code (orig_stmt);
6977       gcc_assert (vectype_out);
6978       vec_mode = TYPE_MODE (vectype_out);
6979     }
6980   else
6981     {
6982       /* Regular reduction: use the same vectype and tree-code as used for
6983          the vector code inside the loop can be used for the epilog code. */
6984       orig_code = code;
6985
6986       if (code == MINUS_EXPR)
6987         orig_code = PLUS_EXPR;
6988
6989       /* For simple condition reductions, replace with the actual expression
6990          we want to base our reduction around.  */
6991       if (reduction_type == CONST_COND_REDUCTION)
6992         {
6993           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6994           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6995         }
6996       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6997         orig_code = cond_reduc_op_code;
6998     }
6999
7000   if (nested_cycle)
7001     {
7002       def_bb = gimple_bb (reduc_def_stmt);
7003       def_stmt_loop = def_bb->loop_father;
7004       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7005                                        loop_preheader_edge (def_stmt_loop));
7006       if (TREE_CODE (def_arg) == SSA_NAME
7007           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7008           && gimple_code (def_arg_stmt) == GIMPLE_PHI
7009           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7010           && vinfo_for_stmt (def_arg_stmt)
7011           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7012               == vect_double_reduction_def)
7013         double_reduc = true;
7014     }
7015
7016   reduc_fn = IFN_LAST;
7017
7018   if (reduction_type == TREE_CODE_REDUCTION
7019       || reduction_type == FOLD_LEFT_REDUCTION
7020       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7021       || reduction_type == CONST_COND_REDUCTION)
7022     {
7023       if (reduction_type == FOLD_LEFT_REDUCTION
7024           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7025           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7026         {
7027           if (reduc_fn != IFN_LAST
7028               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7029                                                   OPTIMIZE_FOR_SPEED))
7030             {
7031               if (dump_enabled_p ())
7032                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7033                                  "reduc op not supported by target.\n");
7034
7035               reduc_fn = IFN_LAST;
7036             }
7037         }
7038       else
7039         {
7040           if (!nested_cycle || double_reduc)
7041             {
7042               if (dump_enabled_p ())
7043                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7044                                  "no reduc code for scalar code.\n");
7045
7046               return false;
7047             }
7048         }
7049     }
7050   else if (reduction_type == COND_REDUCTION)
7051     {
7052       int scalar_precision
7053         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7054       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7055       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7056                                                 nunits_out);
7057
7058       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7059                                           OPTIMIZE_FOR_SPEED))
7060         reduc_fn = IFN_REDUC_MAX;
7061     }
7062
7063   if (reduction_type != EXTRACT_LAST_REDUCTION
7064       && reduc_fn == IFN_LAST
7065       && !nunits_out.is_constant ())
7066     {
7067       if (dump_enabled_p ())
7068         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7069                          "missing target support for reduction on"
7070                          " variable-length vectors.\n");
7071       return false;
7072     }
7073
7074   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7075       && ncopies > 1)
7076     {
7077       if (dump_enabled_p ())
7078         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7079                          "multiple types in double reduction or condition "
7080                          "reduction.\n");
7081       return false;
7082     }
7083
7084   /* For SLP reductions, see if there is a neutral value we can use.  */
7085   tree neutral_op = NULL_TREE;
7086   if (slp_node)
7087     neutral_op
7088       = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7089                                       GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7090
7091   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7092     {
7093       /* We can't support in-order reductions of code such as this:
7094
7095            for (int i = 0; i < n1; ++i)
7096              for (int j = 0; j < n2; ++j)
7097                l += a[j];
7098
7099          since GCC effectively transforms the loop when vectorizing:
7100
7101            for (int i = 0; i < n1 / VF; ++i)
7102              for (int j = 0; j < n2; ++j)
7103                for (int k = 0; k < VF; ++k)
7104                  l += a[j];
7105
7106          which is a reassociation of the original operation.  */
7107       if (dump_enabled_p ())
7108         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7109                          "in-order double reduction not supported.\n");
7110
7111       return false;
7112     }
7113
7114   if (reduction_type == FOLD_LEFT_REDUCTION
7115       && slp_node
7116       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7117     {
7118       /* We cannot use in-order reductions in this case because there is
7119          an implicit reassociation of the operations involved.  */
7120       if (dump_enabled_p ())
7121         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7122                          "in-order unchained SLP reductions not supported.\n");
7123       return false;
7124     }
7125
7126   /* For double reductions, and for SLP reductions with a neutral value,
7127      we construct a variable-length initial vector by loading a vector
7128      full of the neutral value and then shift-and-inserting the start
7129      values into the low-numbered elements.  */
7130   if ((double_reduc || neutral_op)
7131       && !nunits_out.is_constant ()
7132       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7133                                           vectype_out, OPTIMIZE_FOR_SPEED))
7134     {
7135       if (dump_enabled_p ())
7136         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7137                          "reduction on variable-length vectors requires"
7138                          " target support for a vector-shift-and-insert"
7139                          " operation.\n");
7140       return false;
7141     }
7142
7143   /* Check extra constraints for variable-length unchained SLP reductions.  */
7144   if (STMT_SLP_TYPE (stmt_info)
7145       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7146       && !nunits_out.is_constant ())
7147     {
7148       /* We checked above that we could build the initial vector when
7149          there's a neutral element value.  Check here for the case in
7150          which each SLP statement has its own initial value and in which
7151          that value needs to be repeated for every instance of the
7152          statement within the initial vector.  */
7153       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7154       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7155       if (!neutral_op
7156           && !can_duplicate_and_interleave_p (group_size, elt_mode))
7157         {
7158           if (dump_enabled_p ())
7159             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7160                              "unsupported form of SLP reduction for"
7161                              " variable-length vectors: cannot build"
7162                              " initial vector.\n");
7163           return false;
7164         }
7165       /* The epilogue code relies on the number of elements being a multiple
7166          of the group size.  The duplicate-and-interleave approach to setting
7167          up the the initial vector does too.  */
7168       if (!multiple_p (nunits_out, group_size))
7169         {
7170           if (dump_enabled_p ())
7171             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7172                              "unsupported form of SLP reduction for"
7173                              " variable-length vectors: the vector size"
7174                              " is not a multiple of the number of results.\n");
7175           return false;
7176         }
7177     }
7178
7179   /* In case of widenning multiplication by a constant, we update the type
7180      of the constant to be the type of the other operand.  We check that the
7181      constant fits the type in the pattern recognition pass.  */
7182   if (code == DOT_PROD_EXPR
7183       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7184     {
7185       if (TREE_CODE (ops[0]) == INTEGER_CST)
7186         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7187       else if (TREE_CODE (ops[1]) == INTEGER_CST)
7188         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7189       else
7190         {
7191           if (dump_enabled_p ())
7192             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7193                              "invalid types in dot-prod\n");
7194
7195           return false;
7196         }
7197     }
7198
7199   if (reduction_type == COND_REDUCTION)
7200     {
7201       widest_int ni;
7202
7203       if (! max_loop_iterations (loop, &ni))
7204         {
7205           if (dump_enabled_p ())
7206             dump_printf_loc (MSG_NOTE, vect_location,
7207                              "loop count not known, cannot create cond "
7208                              "reduction.\n");
7209           return false;
7210         }
7211       /* Convert backedges to iterations.  */
7212       ni += 1;
7213
7214       /* The additional index will be the same type as the condition.  Check
7215          that the loop can fit into this less one (because we'll use up the
7216          zero slot for when there are no matches).  */
7217       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7218       if (wi::geu_p (ni, wi::to_widest (max_index)))
7219         {
7220           if (dump_enabled_p ())
7221             dump_printf_loc (MSG_NOTE, vect_location,
7222                              "loop size is greater than data size.\n");
7223           return false;
7224         }
7225     }
7226
7227   /* In case the vectorization factor (VF) is bigger than the number
7228      of elements that we can fit in a vectype (nunits), we have to generate
7229      more than one vector stmt - i.e - we need to "unroll" the
7230      vector stmt by a factor VF/nunits.  For more details see documentation
7231      in vectorizable_operation.  */
7232
7233   /* If the reduction is used in an outer loop we need to generate
7234      VF intermediate results, like so (e.g. for ncopies=2):
7235         r0 = phi (init, r0)
7236         r1 = phi (init, r1)
7237         r0 = x0 + r0;
7238         r1 = x1 + r1;
7239     (i.e. we generate VF results in 2 registers).
7240     In this case we have a separate def-use cycle for each copy, and therefore
7241     for each copy we get the vector def for the reduction variable from the
7242     respective phi node created for this copy.
7243
7244     Otherwise (the reduction is unused in the loop nest), we can combine
7245     together intermediate results, like so (e.g. for ncopies=2):
7246         r = phi (init, r)
7247         r = x0 + r;
7248         r = x1 + r;
7249    (i.e. we generate VF/2 results in a single register).
7250    In this case for each copy we get the vector def for the reduction variable
7251    from the vectorized reduction operation generated in the previous iteration.
7252
7253    This only works when we see both the reduction PHI and its only consumer
7254    in vectorizable_reduction and there are no intermediate stmts
7255    participating.  */
7256   use_operand_p use_p;
7257   gimple *use_stmt;
7258   if (ncopies > 1
7259       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7260       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7261       && (use_stmt == stmt
7262           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7263     {
7264       single_defuse_cycle = true;
7265       epilog_copies = 1;
7266     }
7267   else
7268     epilog_copies = ncopies;
7269
7270   /* If the reduction stmt is one of the patterns that have lane
7271      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7272   if ((ncopies > 1
7273        && ! single_defuse_cycle)
7274       && (code == DOT_PROD_EXPR
7275           || code == WIDEN_SUM_EXPR
7276           || code == SAD_EXPR))
7277     {
7278       if (dump_enabled_p ())
7279         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7280                          "multi def-use cycle not possible for lane-reducing "
7281                          "reduction operation\n");
7282       return false;
7283     }
7284
7285   if (slp_node)
7286     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7287   else
7288     vec_num = 1;
7289
7290   internal_fn cond_fn = get_conditional_internal_fn (code);
7291   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7292
7293   if (!vec_stmt) /* transformation not required.  */
7294     {
7295       if (first_p)
7296         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7297       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7298         {
7299           if (reduction_type != FOLD_LEFT_REDUCTION
7300               && (cond_fn == IFN_LAST
7301                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7302                                                       OPTIMIZE_FOR_SPEED)))
7303             {
7304               if (dump_enabled_p ())
7305                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7306                                  "can't use a fully-masked loop because no"
7307                                  " conditional operation is available.\n");
7308               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7309             }
7310           else if (reduc_index == -1)
7311             {
7312               if (dump_enabled_p ())
7313                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7314                                  "can't use a fully-masked loop for chained"
7315                                  " reductions.\n");
7316               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7317             }
7318           else
7319             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7320                                    vectype_in);
7321         }
7322       if (dump_enabled_p ()
7323           && reduction_type == FOLD_LEFT_REDUCTION)
7324         dump_printf_loc (MSG_NOTE, vect_location,
7325                          "using an in-order (fold-left) reduction.\n");
7326       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7327       return true;
7328     }
7329
7330   /* Transform.  */
7331
7332   if (dump_enabled_p ())
7333     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7334
7335   /* FORNOW: Multiple types are not supported for condition.  */
7336   if (code == COND_EXPR)
7337     gcc_assert (ncopies == 1);
7338
7339   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7340
7341   if (reduction_type == FOLD_LEFT_REDUCTION)
7342     return vectorize_fold_left_reduction
7343       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7344        reduc_fn, ops, vectype_in, reduc_index, masks);
7345
7346   if (reduction_type == EXTRACT_LAST_REDUCTION)
7347     {
7348       gcc_assert (!slp_node);
7349       return vectorizable_condition (stmt, gsi, vec_stmt,
7350                                      NULL, reduc_index, NULL);
7351     }
7352
7353   /* Create the destination vector  */
7354   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7355
7356   prev_stmt_info = NULL;
7357   prev_phi_info = NULL;
7358   if (!slp_node)
7359     {
7360       vec_oprnds0.create (1);
7361       vec_oprnds1.create (1);
7362       if (op_type == ternary_op)
7363         vec_oprnds2.create (1);
7364     }
7365
7366   phis.create (vec_num);
7367   vect_defs.create (vec_num);
7368   if (!slp_node)
7369     vect_defs.quick_push (NULL_TREE);
7370
7371   if (slp_node)
7372     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7373   else
7374     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7375
7376   for (j = 0; j < ncopies; j++)
7377     {
7378       if (code == COND_EXPR)
7379         {
7380           gcc_assert (!slp_node);
7381           vectorizable_condition (stmt, gsi, vec_stmt,
7382                                   PHI_RESULT (phis[0]),
7383                                   reduc_index, NULL);
7384           /* Multiple types are not supported for condition.  */
7385           break;
7386         }
7387
7388       /* Handle uses.  */
7389       if (j == 0)
7390         {
7391           if (slp_node)
7392             {
7393               /* Get vec defs for all the operands except the reduction index,
7394                  ensuring the ordering of the ops in the vector is kept.  */
7395               auto_vec<tree, 3> slp_ops;
7396               auto_vec<vec<tree>, 3> vec_defs;
7397
7398               slp_ops.quick_push (ops[0]);
7399               slp_ops.quick_push (ops[1]);
7400               if (op_type == ternary_op)
7401                 slp_ops.quick_push (ops[2]);
7402
7403               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7404
7405               vec_oprnds0.safe_splice (vec_defs[0]);
7406               vec_defs[0].release ();
7407               vec_oprnds1.safe_splice (vec_defs[1]);
7408               vec_defs[1].release ();
7409               if (op_type == ternary_op)
7410                 {
7411                   vec_oprnds2.safe_splice (vec_defs[2]);
7412                   vec_defs[2].release ();
7413                 }
7414             }
7415           else
7416             {
7417               vec_oprnds0.quick_push
7418                 (vect_get_vec_def_for_operand (ops[0], stmt));
7419               vec_oprnds1.quick_push
7420                 (vect_get_vec_def_for_operand (ops[1], stmt));
7421               if (op_type == ternary_op)
7422                 vec_oprnds2.quick_push
7423                   (vect_get_vec_def_for_operand (ops[2], stmt));
7424             }
7425         }
7426       else
7427         {
7428           if (!slp_node)
7429             {
7430               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7431
7432               if (single_defuse_cycle && reduc_index == 0)
7433                 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7434               else
7435                 vec_oprnds0[0]
7436                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7437               if (single_defuse_cycle && reduc_index == 1)
7438                 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7439               else
7440                 vec_oprnds1[0]
7441                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7442               if (op_type == ternary_op)
7443                 {
7444                   if (single_defuse_cycle && reduc_index == 2)
7445                     vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7446                   else
7447                     vec_oprnds2[0]
7448                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7449                 }
7450             }
7451         }
7452
7453       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7454         {
7455           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7456           if (masked_loop_p)
7457             {
7458               /* Make sure that the reduction accumulator is vop[0].  */
7459               if (reduc_index == 1)
7460                 {
7461                   gcc_assert (commutative_tree_code (code));
7462                   std::swap (vop[0], vop[1]);
7463                 }
7464               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7465                                               vectype_in, i * ncopies + j);
7466               gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7467                                                         vop[0], vop[1]);
7468               new_temp = make_ssa_name (vec_dest, call);
7469               gimple_call_set_lhs (call, new_temp);
7470               gimple_call_set_nothrow (call, true);
7471               new_stmt = call;
7472             }
7473           else
7474             {
7475               if (op_type == ternary_op)
7476                 vop[2] = vec_oprnds2[i];
7477
7478               new_temp = make_ssa_name (vec_dest, new_stmt);
7479               new_stmt = gimple_build_assign (new_temp, code,
7480                                               vop[0], vop[1], vop[2]);
7481             }
7482           vect_finish_stmt_generation (stmt, new_stmt, gsi);
7483
7484           if (slp_node)
7485             {
7486               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7487               vect_defs.quick_push (new_temp);
7488             }
7489           else
7490             vect_defs[0] = new_temp;
7491         }
7492
7493       if (slp_node)
7494         continue;
7495
7496       if (j == 0)
7497         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7498       else
7499         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7500
7501       prev_stmt_info = vinfo_for_stmt (new_stmt);
7502     }
7503
7504   /* Finalize the reduction-phi (set its arguments) and create the
7505      epilog reduction code.  */
7506   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7507     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7508
7509   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7510                                     epilog_copies, reduc_fn, phis,
7511                                     double_reduc, slp_node, slp_node_instance,
7512                                     cond_reduc_val, cond_reduc_op_code,
7513                                     neutral_op);
7514
7515   return true;
7516 }
7517
7518 /* Function vect_min_worthwhile_factor.
7519
7520    For a loop where we could vectorize the operation indicated by CODE,
7521    return the minimum vectorization factor that makes it worthwhile
7522    to use generic vectors.  */
7523 static unsigned int
7524 vect_min_worthwhile_factor (enum tree_code code)
7525 {
7526   switch (code)
7527     {
7528     case PLUS_EXPR:
7529     case MINUS_EXPR:
7530     case NEGATE_EXPR:
7531       return 4;
7532
7533     case BIT_AND_EXPR:
7534     case BIT_IOR_EXPR:
7535     case BIT_XOR_EXPR:
7536     case BIT_NOT_EXPR:
7537       return 2;
7538
7539     default:
7540       return INT_MAX;
7541     }
7542 }
7543
7544 /* Return true if VINFO indicates we are doing loop vectorization and if
7545    it is worth decomposing CODE operations into scalar operations for
7546    that loop's vectorization factor.  */
7547
7548 bool
7549 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7550 {
7551   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7552   unsigned HOST_WIDE_INT value;
7553   return (loop_vinfo
7554           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7555           && value >= vect_min_worthwhile_factor (code));
7556 }
7557
7558 /* Function vectorizable_induction
7559
7560    Check if PHI performs an induction computation that can be vectorized.
7561    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7562    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7563    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7564
7565 bool
7566 vectorizable_induction (gimple *phi,
7567                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7568                         gimple **vec_stmt, slp_tree slp_node)
7569 {
7570   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7571   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7572   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7573   unsigned ncopies;
7574   bool nested_in_vect_loop = false;
7575   struct loop *iv_loop;
7576   tree vec_def;
7577   edge pe = loop_preheader_edge (loop);
7578   basic_block new_bb;
7579   tree new_vec, vec_init, vec_step, t;
7580   tree new_name;
7581   gimple *new_stmt;
7582   gphi *induction_phi;
7583   tree induc_def, vec_dest;
7584   tree init_expr, step_expr;
7585   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7586   unsigned i;
7587   tree expr;
7588   gimple_seq stmts;
7589   imm_use_iterator imm_iter;
7590   use_operand_p use_p;
7591   gimple *exit_phi;
7592   edge latch_e;
7593   tree loop_arg;
7594   gimple_stmt_iterator si;
7595   basic_block bb = gimple_bb (phi);
7596
7597   if (gimple_code (phi) != GIMPLE_PHI)
7598     return false;
7599
7600   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7601     return false;
7602
7603   /* Make sure it was recognized as induction computation.  */
7604   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7605     return false;
7606
7607   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7608   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7609
7610   if (slp_node)
7611     ncopies = 1;
7612   else
7613     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7614   gcc_assert (ncopies >= 1);
7615
7616   /* FORNOW. These restrictions should be relaxed.  */
7617   if (nested_in_vect_loop_p (loop, phi))
7618     {
7619       imm_use_iterator imm_iter;
7620       use_operand_p use_p;
7621       gimple *exit_phi;
7622       edge latch_e;
7623       tree loop_arg;
7624
7625       if (ncopies > 1)
7626         {
7627           if (dump_enabled_p ())
7628             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7629                              "multiple types in nested loop.\n");
7630           return false;
7631         }
7632
7633       /* FORNOW: outer loop induction with SLP not supported.  */
7634       if (STMT_SLP_TYPE (stmt_info))
7635         return false;
7636
7637       exit_phi = NULL;
7638       latch_e = loop_latch_edge (loop->inner);
7639       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7640       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7641         {
7642           gimple *use_stmt = USE_STMT (use_p);
7643           if (is_gimple_debug (use_stmt))
7644             continue;
7645
7646           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7647             {
7648               exit_phi = use_stmt;
7649               break;
7650             }
7651         }
7652       if (exit_phi)
7653         {
7654           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7655           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7656                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7657             {
7658               if (dump_enabled_p ())
7659                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7660                                  "inner-loop induction only used outside "
7661                                  "of the outer vectorized loop.\n");
7662               return false;
7663             }
7664         }
7665
7666       nested_in_vect_loop = true;
7667       iv_loop = loop->inner;
7668     }
7669   else
7670     iv_loop = loop;
7671   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7672
7673   if (slp_node && !nunits.is_constant ())
7674     {
7675       /* The current SLP code creates the initial value element-by-element.  */
7676       if (dump_enabled_p ())
7677         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7678                          "SLP induction not supported for variable-length"
7679                          " vectors.\n");
7680       return false;
7681     }
7682
7683   if (!vec_stmt) /* transformation not required.  */
7684     {
7685       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7686       if (dump_enabled_p ())
7687         dump_printf_loc (MSG_NOTE, vect_location,
7688                          "=== vectorizable_induction ===\n");
7689       vect_model_induction_cost (stmt_info, ncopies);
7690       return true;
7691     }
7692
7693   /* Transform.  */
7694
7695   /* Compute a vector variable, initialized with the first VF values of
7696      the induction variable.  E.g., for an iv with IV_PHI='X' and
7697      evolution S, for a vector of 4 units, we want to compute:
7698      [X, X + S, X + 2*S, X + 3*S].  */
7699
7700   if (dump_enabled_p ())
7701     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7702
7703   latch_e = loop_latch_edge (iv_loop);
7704   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7705
7706   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7707   gcc_assert (step_expr != NULL_TREE);
7708
7709   pe = loop_preheader_edge (iv_loop);
7710   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7711                                      loop_preheader_edge (iv_loop));
7712
7713   stmts = NULL;
7714   if (!nested_in_vect_loop)
7715     {
7716       /* Convert the initial value to the desired type.  */
7717       tree new_type = TREE_TYPE (vectype);
7718       init_expr = gimple_convert (&stmts, new_type, init_expr);
7719
7720       /* If we are using the loop mask to "peel" for alignment then we need
7721          to adjust the start value here.  */
7722       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7723       if (skip_niters != NULL_TREE)
7724         {
7725           if (FLOAT_TYPE_P (vectype))
7726             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7727                                         skip_niters);
7728           else
7729             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7730           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7731                                          skip_niters, step_expr);
7732           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7733                                     init_expr, skip_step);
7734         }
7735     }
7736
7737   /* Convert the step to the desired type.  */
7738   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7739
7740   if (stmts)
7741     {
7742       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7743       gcc_assert (!new_bb);
7744     }
7745
7746   /* Find the first insertion point in the BB.  */
7747   si = gsi_after_labels (bb);
7748
7749   /* For SLP induction we have to generate several IVs as for example
7750      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7751      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7752      [VF*S, VF*S, VF*S, VF*S] for all.  */
7753   if (slp_node)
7754     {
7755       /* Enforced above.  */
7756       unsigned int const_nunits = nunits.to_constant ();
7757
7758       /* Generate [VF*S, VF*S, ... ].  */
7759       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7760         {
7761           expr = build_int_cst (integer_type_node, vf);
7762           expr = fold_convert (TREE_TYPE (step_expr), expr);
7763         }
7764       else
7765         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7766       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7767                               expr, step_expr);
7768       if (! CONSTANT_CLASS_P (new_name))
7769         new_name = vect_init_vector (phi, new_name,
7770                                      TREE_TYPE (step_expr), NULL);
7771       new_vec = build_vector_from_val (vectype, new_name);
7772       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7773
7774       /* Now generate the IVs.  */
7775       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7776       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7777       unsigned elts = const_nunits * nvects;
7778       unsigned nivs = least_common_multiple (group_size,
7779                                              const_nunits) / const_nunits;
7780       gcc_assert (elts % group_size == 0);
7781       tree elt = init_expr;
7782       unsigned ivn;
7783       for (ivn = 0; ivn < nivs; ++ivn)
7784         {
7785           tree_vector_builder elts (vectype, const_nunits, 1);
7786           stmts = NULL;
7787           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7788             {
7789               if (ivn*const_nunits + eltn >= group_size
7790                   && (ivn * const_nunits + eltn) % group_size == 0)
7791                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7792                                     elt, step_expr);
7793               elts.quick_push (elt);
7794             }
7795           vec_init = gimple_build_vector (&stmts, &elts);
7796           if (stmts)
7797             {
7798               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7799               gcc_assert (!new_bb);
7800             }
7801
7802           /* Create the induction-phi that defines the induction-operand.  */
7803           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7804           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7805           set_vinfo_for_stmt (induction_phi,
7806                               new_stmt_vec_info (induction_phi, loop_vinfo));
7807           induc_def = PHI_RESULT (induction_phi);
7808
7809           /* Create the iv update inside the loop  */
7810           vec_def = make_ssa_name (vec_dest);
7811           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7812           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7813           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7814
7815           /* Set the arguments of the phi node:  */
7816           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7817           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7818                        UNKNOWN_LOCATION);
7819
7820           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7821         }
7822
7823       /* Re-use IVs when we can.  */
7824       if (ivn < nvects)
7825         {
7826           unsigned vfp
7827             = least_common_multiple (group_size, const_nunits) / group_size;
7828           /* Generate [VF'*S, VF'*S, ... ].  */
7829           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7830             {
7831               expr = build_int_cst (integer_type_node, vfp);
7832               expr = fold_convert (TREE_TYPE (step_expr), expr);
7833             }
7834           else
7835             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7836           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7837                                   expr, step_expr);
7838           if (! CONSTANT_CLASS_P (new_name))
7839             new_name = vect_init_vector (phi, new_name,
7840                                          TREE_TYPE (step_expr), NULL);
7841           new_vec = build_vector_from_val (vectype, new_name);
7842           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7843           for (; ivn < nvects; ++ivn)
7844             {
7845               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7846               tree def;
7847               if (gimple_code (iv) == GIMPLE_PHI)
7848                 def = gimple_phi_result (iv);
7849               else
7850                 def = gimple_assign_lhs (iv);
7851               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7852                                               PLUS_EXPR,
7853                                               def, vec_step);
7854               if (gimple_code (iv) == GIMPLE_PHI)
7855                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7856               else
7857                 {
7858                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7859                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7860                 }
7861               set_vinfo_for_stmt (new_stmt,
7862                                   new_stmt_vec_info (new_stmt, loop_vinfo));
7863               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7864             }
7865         }
7866
7867       return true;
7868     }
7869
7870   /* Create the vector that holds the initial_value of the induction.  */
7871   if (nested_in_vect_loop)
7872     {
7873       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7874          been created during vectorization of previous stmts.  We obtain it
7875          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7876       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7877       /* If the initial value is not of proper type, convert it.  */
7878       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7879         {
7880           new_stmt
7881             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7882                                                           vect_simple_var,
7883                                                           "vec_iv_"),
7884                                    VIEW_CONVERT_EXPR,
7885                                    build1 (VIEW_CONVERT_EXPR, vectype,
7886                                            vec_init));
7887           vec_init = gimple_assign_lhs (new_stmt);
7888           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7889                                                  new_stmt);
7890           gcc_assert (!new_bb);
7891           set_vinfo_for_stmt (new_stmt,
7892                               new_stmt_vec_info (new_stmt, loop_vinfo));
7893         }
7894     }
7895   else
7896     {
7897       /* iv_loop is the loop to be vectorized. Create:
7898          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7899       stmts = NULL;
7900       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7901
7902       unsigned HOST_WIDE_INT const_nunits;
7903       if (nunits.is_constant (&const_nunits))
7904         {
7905           tree_vector_builder elts (vectype, const_nunits, 1);
7906           elts.quick_push (new_name);
7907           for (i = 1; i < const_nunits; i++)
7908             {
7909               /* Create: new_name_i = new_name + step_expr  */
7910               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7911                                        new_name, step_expr);
7912               elts.quick_push (new_name);
7913             }
7914           /* Create a vector from [new_name_0, new_name_1, ...,
7915              new_name_nunits-1]  */
7916           vec_init = gimple_build_vector (&stmts, &elts);
7917         }
7918       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7919         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7920         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7921                                  new_name, step_expr);
7922       else
7923         {
7924           /* Build:
7925                 [base, base, base, ...]
7926                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7927           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7928           gcc_assert (flag_associative_math);
7929           tree index = build_index_vector (vectype, 0, 1);
7930           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7931                                                         new_name);
7932           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7933                                                         step_expr);
7934           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7935           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7936                                    vec_init, step_vec);
7937           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7938                                    vec_init, base_vec);
7939         }
7940
7941       if (stmts)
7942         {
7943           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7944           gcc_assert (!new_bb);
7945         }
7946     }
7947
7948
7949   /* Create the vector that holds the step of the induction.  */
7950   if (nested_in_vect_loop)
7951     /* iv_loop is nested in the loop to be vectorized. Generate:
7952        vec_step = [S, S, S, S]  */
7953     new_name = step_expr;
7954   else
7955     {
7956       /* iv_loop is the loop to be vectorized. Generate:
7957           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7958       gimple_seq seq = NULL;
7959       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7960         {
7961           expr = build_int_cst (integer_type_node, vf);
7962           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7963         }
7964       else
7965         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7966       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7967                                expr, step_expr);
7968       if (seq)
7969         {
7970           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7971           gcc_assert (!new_bb);
7972         }
7973     }
7974
7975   t = unshare_expr (new_name);
7976   gcc_assert (CONSTANT_CLASS_P (new_name)
7977               || TREE_CODE (new_name) == SSA_NAME);
7978   new_vec = build_vector_from_val (vectype, t);
7979   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7980
7981
7982   /* Create the following def-use cycle:
7983      loop prolog:
7984          vec_init = ...
7985          vec_step = ...
7986      loop:
7987          vec_iv = PHI <vec_init, vec_loop>
7988          ...
7989          STMT
7990          ...
7991          vec_loop = vec_iv + vec_step;  */
7992
7993   /* Create the induction-phi that defines the induction-operand.  */
7994   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7995   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7996   set_vinfo_for_stmt (induction_phi,
7997                       new_stmt_vec_info (induction_phi, loop_vinfo));
7998   induc_def = PHI_RESULT (induction_phi);
7999
8000   /* Create the iv update inside the loop  */
8001   vec_def = make_ssa_name (vec_dest);
8002   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8003   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8004   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8005
8006   /* Set the arguments of the phi node:  */
8007   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8008   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8009                UNKNOWN_LOCATION);
8010
8011   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8012
8013   /* In case that vectorization factor (VF) is bigger than the number
8014      of elements that we can fit in a vectype (nunits), we have to generate
8015      more than one vector stmt - i.e - we need to "unroll" the
8016      vector stmt by a factor VF/nunits.  For more details see documentation
8017      in vectorizable_operation.  */
8018
8019   if (ncopies > 1)
8020     {
8021       gimple_seq seq = NULL;
8022       stmt_vec_info prev_stmt_vinfo;
8023       /* FORNOW. This restriction should be relaxed.  */
8024       gcc_assert (!nested_in_vect_loop);
8025
8026       /* Create the vector that holds the step of the induction.  */
8027       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8028         {
8029           expr = build_int_cst (integer_type_node, nunits);
8030           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8031         }
8032       else
8033         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8034       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8035                                expr, step_expr);
8036       if (seq)
8037         {
8038           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8039           gcc_assert (!new_bb);
8040         }
8041
8042       t = unshare_expr (new_name);
8043       gcc_assert (CONSTANT_CLASS_P (new_name)
8044                   || TREE_CODE (new_name) == SSA_NAME);
8045       new_vec = build_vector_from_val (vectype, t);
8046       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8047
8048       vec_def = induc_def;
8049       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8050       for (i = 1; i < ncopies; i++)
8051         {
8052           /* vec_i = vec_prev + vec_step  */
8053           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8054                                           vec_def, vec_step);
8055           vec_def = make_ssa_name (vec_dest, new_stmt);
8056           gimple_assign_set_lhs (new_stmt, vec_def);
8057
8058           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8059           set_vinfo_for_stmt (new_stmt,
8060                               new_stmt_vec_info (new_stmt, loop_vinfo));
8061           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8062           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8063         }
8064     }
8065
8066   if (nested_in_vect_loop)
8067     {
8068       /* Find the loop-closed exit-phi of the induction, and record
8069          the final vector of induction results:  */
8070       exit_phi = NULL;
8071       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8072         {
8073           gimple *use_stmt = USE_STMT (use_p);
8074           if (is_gimple_debug (use_stmt))
8075             continue;
8076
8077           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8078             {
8079               exit_phi = use_stmt;
8080               break;
8081             }
8082         }
8083       if (exit_phi)
8084         {
8085           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8086           /* FORNOW. Currently not supporting the case that an inner-loop induction
8087              is not used in the outer-loop (i.e. only outside the outer-loop).  */
8088           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8089                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
8090
8091           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8092           if (dump_enabled_p ())
8093             {
8094               dump_printf_loc (MSG_NOTE, vect_location,
8095                                "vector of inductions after inner-loop:");
8096               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8097             }
8098         }
8099     }
8100
8101
8102   if (dump_enabled_p ())
8103     {
8104       dump_printf_loc (MSG_NOTE, vect_location,
8105                        "transform induction: created def-use cycle: ");
8106       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8107       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8108                         SSA_NAME_DEF_STMT (vec_def), 0);
8109     }
8110
8111   return true;
8112 }
8113
8114 /* Function vectorizable_live_operation.
8115
8116    STMT computes a value that is used outside the loop.  Check if
8117    it can be supported.  */
8118
8119 bool
8120 vectorizable_live_operation (gimple *stmt,
8121                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8122                              slp_tree slp_node, int slp_index,
8123                              gimple **vec_stmt)
8124 {
8125   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8126   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8127   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8128   imm_use_iterator imm_iter;
8129   tree lhs, lhs_type, bitsize, vec_bitsize;
8130   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8131   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8132   int ncopies;
8133   gimple *use_stmt;
8134   auto_vec<tree> vec_oprnds;
8135   int vec_entry = 0;
8136   poly_uint64 vec_index = 0;
8137
8138   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8139
8140   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8141     return false;
8142
8143   /* FORNOW.  CHECKME.  */
8144   if (nested_in_vect_loop_p (loop, stmt))
8145     return false;
8146
8147   /* If STMT is not relevant and it is a simple assignment and its inputs are
8148      invariant then it can remain in place, unvectorized.  The original last
8149      scalar value that it computes will be used.  */
8150   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8151     {
8152       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8153       if (dump_enabled_p ())
8154         dump_printf_loc (MSG_NOTE, vect_location,
8155                          "statement is simple and uses invariant.  Leaving in "
8156                          "place.\n");
8157       return true;
8158     }
8159
8160   if (slp_node)
8161     ncopies = 1;
8162   else
8163     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8164
8165   if (slp_node)
8166     {
8167       gcc_assert (slp_index >= 0);
8168
8169       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8170       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8171
8172       /* Get the last occurrence of the scalar index from the concatenation of
8173          all the slp vectors. Calculate which slp vector it is and the index
8174          within.  */
8175       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8176
8177       /* Calculate which vector contains the result, and which lane of
8178          that vector we need.  */
8179       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8180         {
8181           if (dump_enabled_p ())
8182             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8183                              "Cannot determine which vector holds the"
8184                              " final result.\n");
8185           return false;
8186         }
8187     }
8188
8189   if (!vec_stmt)
8190     {
8191       /* No transformation required.  */
8192       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8193         {
8194           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8195                                                OPTIMIZE_FOR_SPEED))
8196             {
8197               if (dump_enabled_p ())
8198                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8199                                  "can't use a fully-masked loop because "
8200                                  "the target doesn't support extract last "
8201                                  "reduction.\n");
8202               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8203             }
8204           else if (slp_node)
8205             {
8206               if (dump_enabled_p ())
8207                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8208                                  "can't use a fully-masked loop because an "
8209                                  "SLP statement is live after the loop.\n");
8210               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8211             }
8212           else if (ncopies > 1)
8213             {
8214               if (dump_enabled_p ())
8215                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8216                                  "can't use a fully-masked loop because"
8217                                  " ncopies is greater than 1.\n");
8218               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8219             }
8220           else
8221             {
8222               gcc_assert (ncopies == 1 && !slp_node);
8223               vect_record_loop_mask (loop_vinfo,
8224                                      &LOOP_VINFO_MASKS (loop_vinfo),
8225                                      1, vectype);
8226             }
8227         }
8228       return true;
8229     }
8230
8231   /* If stmt has a related stmt, then use that for getting the lhs.  */
8232   if (is_pattern_stmt_p (stmt_info))
8233     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8234
8235   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8236         : gimple_get_lhs (stmt);
8237   lhs_type = TREE_TYPE (lhs);
8238
8239   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8240              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8241              : TYPE_SIZE (TREE_TYPE (vectype)));
8242   vec_bitsize = TYPE_SIZE (vectype);
8243
8244   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8245   tree vec_lhs, bitstart;
8246   if (slp_node)
8247     {
8248       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8249
8250       /* Get the correct slp vectorized stmt.  */
8251       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8252       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8253         vec_lhs = gimple_phi_result (phi);
8254       else
8255         vec_lhs = gimple_get_lhs (vec_stmt);
8256
8257       /* Get entry to use.  */
8258       bitstart = bitsize_int (vec_index);
8259       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8260     }
8261   else
8262     {
8263       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8264       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8265       gcc_checking_assert (ncopies == 1
8266                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8267
8268       /* For multiple copies, get the last copy.  */
8269       for (int i = 1; i < ncopies; ++i)
8270         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8271                                                   vec_lhs);
8272
8273       /* Get the last lane in the vector.  */
8274       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8275     }
8276
8277   gimple_seq stmts = NULL;
8278   tree new_tree;
8279   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8280     {
8281       /* Emit:
8282
8283            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8284
8285          where VEC_LHS is the vectorized live-out result and MASK is
8286          the loop mask for the final iteration.  */
8287       gcc_assert (ncopies == 1 && !slp_node);
8288       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8289       tree scalar_res = make_ssa_name (scalar_type);
8290       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8291                                       1, vectype, 0);
8292       gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8293                                                     2, mask, vec_lhs);
8294       gimple_call_set_lhs (new_stmt, scalar_res);
8295       gimple_seq_add_stmt (&stmts, new_stmt);
8296
8297       /* Convert the extracted vector element to the required scalar type.  */
8298       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8299     }
8300   else
8301     {
8302       tree bftype = TREE_TYPE (vectype);
8303       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8304         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8305       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8306       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8307                                        &stmts, true, NULL_TREE);
8308     }
8309
8310   if (stmts)
8311     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8312
8313   /* Replace use of lhs with newly computed result.  If the use stmt is a
8314      single arg PHI, just replace all uses of PHI result.  It's necessary
8315      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8316   use_operand_p use_p;
8317   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8318     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8319         && !is_gimple_debug (use_stmt))
8320     {
8321       if (gimple_code (use_stmt) == GIMPLE_PHI
8322           && gimple_phi_num_args (use_stmt) == 1)
8323         {
8324           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8325         }
8326       else
8327         {
8328           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8329             SET_USE (use_p, new_tree);
8330         }
8331       update_stmt (use_stmt);
8332     }
8333
8334   return true;
8335 }
8336
8337 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8338
8339 static void
8340 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8341 {
8342   ssa_op_iter op_iter;
8343   imm_use_iterator imm_iter;
8344   def_operand_p def_p;
8345   gimple *ustmt;
8346
8347   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8348     {
8349       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8350         {
8351           basic_block bb;
8352
8353           if (!is_gimple_debug (ustmt))
8354             continue;
8355
8356           bb = gimple_bb (ustmt);
8357
8358           if (!flow_bb_inside_loop_p (loop, bb))
8359             {
8360               if (gimple_debug_bind_p (ustmt))
8361                 {
8362                   if (dump_enabled_p ())
8363                     dump_printf_loc (MSG_NOTE, vect_location,
8364                                      "killing debug use\n");
8365
8366                   gimple_debug_bind_reset_value (ustmt);
8367                   update_stmt (ustmt);
8368                 }
8369               else
8370                 gcc_unreachable ();
8371             }
8372         }
8373     }
8374 }
8375
8376 /* Given loop represented by LOOP_VINFO, return true if computation of
8377    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8378    otherwise.  */
8379
8380 static bool
8381 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8382 {
8383   /* Constant case.  */
8384   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8385     {
8386       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8387       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8388
8389       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8390       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8391       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8392         return true;
8393     }
8394
8395   widest_int max;
8396   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8397   /* Check the upper bound of loop niters.  */
8398   if (get_max_loop_iterations (loop, &max))
8399     {
8400       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8401       signop sgn = TYPE_SIGN (type);
8402       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8403       if (max < type_max)
8404         return true;
8405     }
8406   return false;
8407 }
8408
8409 /* Return a mask type with half the number of elements as TYPE.  */
8410
8411 tree
8412 vect_halve_mask_nunits (tree type)
8413 {
8414   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8415   return build_truth_vector_type (nunits, current_vector_size);
8416 }
8417
8418 /* Return a mask type with twice as many elements as TYPE.  */
8419
8420 tree
8421 vect_double_mask_nunits (tree type)
8422 {
8423   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8424   return build_truth_vector_type (nunits, current_vector_size);
8425 }
8426
8427 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8428    contain a sequence of NVECTORS masks that each control a vector of type
8429    VECTYPE.  */
8430
8431 void
8432 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8433                        unsigned int nvectors, tree vectype)
8434 {
8435   gcc_assert (nvectors != 0);
8436   if (masks->length () < nvectors)
8437     masks->safe_grow_cleared (nvectors);
8438   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8439   /* The number of scalars per iteration and the number of vectors are
8440      both compile-time constants.  */
8441   unsigned int nscalars_per_iter
8442     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8443                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8444   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8445     {
8446       rgm->max_nscalars_per_iter = nscalars_per_iter;
8447       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8448     }
8449 }
8450
8451 /* Given a complete set of masks MASKS, extract mask number INDEX
8452    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8453    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8454
8455    See the comment above vec_loop_masks for more details about the mask
8456    arrangement.  */
8457
8458 tree
8459 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8460                     unsigned int nvectors, tree vectype, unsigned int index)
8461 {
8462   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8463   tree mask_type = rgm->mask_type;
8464
8465   /* Populate the rgroup's mask array, if this is the first time we've
8466      used it.  */
8467   if (rgm->masks.is_empty ())
8468     {
8469       rgm->masks.safe_grow_cleared (nvectors);
8470       for (unsigned int i = 0; i < nvectors; ++i)
8471         {
8472           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8473           /* Provide a dummy definition until the real one is available.  */
8474           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8475           rgm->masks[i] = mask;
8476         }
8477     }
8478
8479   tree mask = rgm->masks[index];
8480   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8481                 TYPE_VECTOR_SUBPARTS (vectype)))
8482     {
8483       /* A loop mask for data type X can be reused for data type Y
8484          if X has N times more elements than Y and if Y's elements
8485          are N times bigger than X's.  In this case each sequence
8486          of N elements in the loop mask will be all-zero or all-one.
8487          We can then view-convert the mask so that each sequence of
8488          N elements is replaced by a single element.  */
8489       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8490                               TYPE_VECTOR_SUBPARTS (vectype)));
8491       gimple_seq seq = NULL;
8492       mask_type = build_same_sized_truth_vector_type (vectype);
8493       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8494       if (seq)
8495         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8496     }
8497   return mask;
8498 }
8499
8500 /* Scale profiling counters by estimation for LOOP which is vectorized
8501    by factor VF.  */
8502
8503 static void
8504 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8505 {
8506   edge preheader = loop_preheader_edge (loop);
8507   /* Reduce loop iterations by the vectorization factor.  */
8508   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8509   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8510
8511   if (freq_h.nonzero_p ())
8512     {
8513       profile_probability p;
8514
8515       /* Avoid dropping loop body profile counter to 0 because of zero count
8516          in loop's preheader.  */
8517       if (!(freq_e == profile_count::zero ()))
8518         freq_e = freq_e.force_nonzero ();
8519       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8520       scale_loop_frequencies (loop, p);
8521     }
8522
8523   edge exit_e = single_exit (loop);
8524   exit_e->probability = profile_probability::always ()
8525                                  .apply_scale (1, new_est_niter + 1);
8526
8527   edge exit_l = single_pred_edge (loop->latch);
8528   profile_probability prob = exit_l->probability;
8529   exit_l->probability = exit_e->probability.invert ();
8530   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8531     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8532 }
8533
8534 /* Function vect_transform_loop.
8535
8536    The analysis phase has determined that the loop is vectorizable.
8537    Vectorize the loop - created vectorized stmts to replace the scalar
8538    stmts in the loop, and update the loop exit condition.
8539    Returns scalar epilogue loop if any.  */
8540
8541 struct loop *
8542 vect_transform_loop (loop_vec_info loop_vinfo)
8543 {
8544   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8545   struct loop *epilogue = NULL;
8546   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8547   int nbbs = loop->num_nodes;
8548   int i;
8549   tree niters_vector = NULL_TREE;
8550   tree step_vector = NULL_TREE;
8551   tree niters_vector_mult_vf = NULL_TREE;
8552   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8553   unsigned int lowest_vf = constant_lower_bound (vf);
8554   bool grouped_store;
8555   bool slp_scheduled = false;
8556   gimple *stmt, *pattern_stmt;
8557   gimple_seq pattern_def_seq = NULL;
8558   gimple_stmt_iterator pattern_def_si = gsi_none ();
8559   bool transform_pattern_stmt = false;
8560   bool check_profitability = false;
8561   unsigned int th;
8562
8563   if (dump_enabled_p ())
8564     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8565
8566   /* Use the more conservative vectorization threshold.  If the number
8567      of iterations is constant assume the cost check has been performed
8568      by our caller.  If the threshold makes all loops profitable that
8569      run at least the (estimated) vectorization factor number of times
8570      checking is pointless, too.  */
8571   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8572   if (th >= vect_vf_for_cost (loop_vinfo)
8573       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8574     {
8575       if (dump_enabled_p ())
8576         dump_printf_loc (MSG_NOTE, vect_location,
8577                          "Profitability threshold is %d loop iterations.\n",
8578                          th);
8579       check_profitability = true;
8580     }
8581
8582   /* Make sure there exists a single-predecessor exit bb.  Do this before
8583      versioning.   */
8584   edge e = single_exit (loop);
8585   if (! single_pred_p (e->dest))
8586     {
8587       split_loop_exit_edge (e);
8588       if (dump_enabled_p ())
8589         dump_printf (MSG_NOTE, "split exit edge\n");
8590     }
8591
8592   /* Version the loop first, if required, so the profitability check
8593      comes first.  */
8594
8595   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8596     {
8597       poly_uint64 versioning_threshold
8598         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8599       if (check_profitability
8600           && ordered_p (poly_uint64 (th), versioning_threshold))
8601         {
8602           versioning_threshold = ordered_max (poly_uint64 (th),
8603                                               versioning_threshold);
8604           check_profitability = false;
8605         }
8606       vect_loop_versioning (loop_vinfo, th, check_profitability,
8607                             versioning_threshold);
8608       check_profitability = false;
8609     }
8610
8611   /* Make sure there exists a single-predecessor exit bb also on the
8612      scalar loop copy.  Do this after versioning but before peeling
8613      so CFG structure is fine for both scalar and if-converted loop
8614      to make slpeel_duplicate_current_defs_from_edges face matched
8615      loop closed PHI nodes on the exit.  */
8616   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8617     {
8618       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8619       if (! single_pred_p (e->dest))
8620         {
8621           split_loop_exit_edge (e);
8622           if (dump_enabled_p ())
8623             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8624         }
8625     }
8626
8627   tree niters = vect_build_loop_niters (loop_vinfo);
8628   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8629   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8630   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8631   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8632                               &step_vector, &niters_vector_mult_vf, th,
8633                               check_profitability, niters_no_overflow);
8634
8635   if (niters_vector == NULL_TREE)
8636     {
8637       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8638           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8639           && known_eq (lowest_vf, vf))
8640         {
8641           niters_vector
8642             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8643                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8644           step_vector = build_one_cst (TREE_TYPE (niters));
8645         }
8646       else
8647         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8648                                      &step_vector, niters_no_overflow);
8649     }
8650
8651   /* 1) Make sure the loop header has exactly two entries
8652      2) Make sure we have a preheader basic block.  */
8653
8654   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8655
8656   split_edge (loop_preheader_edge (loop));
8657
8658   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8659       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8660     /* This will deal with any possible peeling.  */
8661     vect_prepare_for_masked_peels (loop_vinfo);
8662
8663   /* FORNOW: the vectorizer supports only loops which body consist
8664      of one basic block (header + empty latch). When the vectorizer will
8665      support more involved loop forms, the order by which the BBs are
8666      traversed need to be reconsidered.  */
8667
8668   for (i = 0; i < nbbs; i++)
8669     {
8670       basic_block bb = bbs[i];
8671       stmt_vec_info stmt_info;
8672
8673       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8674            gsi_next (&si))
8675         {
8676           gphi *phi = si.phi ();
8677           if (dump_enabled_p ())
8678             {
8679               dump_printf_loc (MSG_NOTE, vect_location,
8680                                "------>vectorizing phi: ");
8681               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8682             }
8683           stmt_info = vinfo_for_stmt (phi);
8684           if (!stmt_info)
8685             continue;
8686
8687           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8688             vect_loop_kill_debug_uses (loop, phi);
8689
8690           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8691               && !STMT_VINFO_LIVE_P (stmt_info))
8692             continue;
8693
8694           if (STMT_VINFO_VECTYPE (stmt_info)
8695               && (maybe_ne
8696                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8697               && dump_enabled_p ())
8698             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8699
8700           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8701                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8702                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8703               && ! PURE_SLP_STMT (stmt_info))
8704             {
8705               if (dump_enabled_p ())
8706                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8707               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8708             }
8709         }
8710
8711       pattern_stmt = NULL;
8712       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8713            !gsi_end_p (si) || transform_pattern_stmt;)
8714         {
8715           bool is_store;
8716
8717           if (transform_pattern_stmt)
8718             stmt = pattern_stmt;
8719           else
8720             {
8721               stmt = gsi_stmt (si);
8722               /* During vectorization remove existing clobber stmts.  */
8723               if (gimple_clobber_p (stmt))
8724                 {
8725                   unlink_stmt_vdef (stmt);
8726                   gsi_remove (&si, true);
8727                   release_defs (stmt);
8728                   continue;
8729                 }
8730             }
8731
8732           if (dump_enabled_p ())
8733             {
8734               dump_printf_loc (MSG_NOTE, vect_location,
8735                                "------>vectorizing statement: ");
8736               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8737             }
8738
8739           stmt_info = vinfo_for_stmt (stmt);
8740
8741           /* vector stmts created in the outer-loop during vectorization of
8742              stmts in an inner-loop may not have a stmt_info, and do not
8743              need to be vectorized.  */
8744           if (!stmt_info)
8745             {
8746               gsi_next (&si);
8747               continue;
8748             }
8749
8750           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8751             vect_loop_kill_debug_uses (loop, stmt);
8752
8753           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8754               && !STMT_VINFO_LIVE_P (stmt_info))
8755             {
8756               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8757                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8758                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8759                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8760                 {
8761                   stmt = pattern_stmt;
8762                   stmt_info = vinfo_for_stmt (stmt);
8763                 }
8764               else
8765                 {
8766                   gsi_next (&si);
8767                   continue;
8768                 }
8769             }
8770           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8771                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8772                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8773                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8774             transform_pattern_stmt = true;
8775
8776           /* If pattern statement has def stmts, vectorize them too.  */
8777           if (is_pattern_stmt_p (stmt_info))
8778             {
8779               if (pattern_def_seq == NULL)
8780                 {
8781                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8782                   pattern_def_si = gsi_start (pattern_def_seq);
8783                 }
8784               else if (!gsi_end_p (pattern_def_si))
8785                 gsi_next (&pattern_def_si);
8786               if (pattern_def_seq != NULL)
8787                 {
8788                   gimple *pattern_def_stmt = NULL;
8789                   stmt_vec_info pattern_def_stmt_info = NULL;
8790
8791                   while (!gsi_end_p (pattern_def_si))
8792                     {
8793                       pattern_def_stmt = gsi_stmt (pattern_def_si);
8794                       pattern_def_stmt_info
8795                         = vinfo_for_stmt (pattern_def_stmt);
8796                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8797                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8798                         break;
8799                       gsi_next (&pattern_def_si);
8800                     }
8801
8802                   if (!gsi_end_p (pattern_def_si))
8803                     {
8804                       if (dump_enabled_p ())
8805                         {
8806                           dump_printf_loc (MSG_NOTE, vect_location,
8807                                            "==> vectorizing pattern def "
8808                                            "stmt: ");
8809                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8810                                             pattern_def_stmt, 0);
8811                         }
8812
8813                       stmt = pattern_def_stmt;
8814                       stmt_info = pattern_def_stmt_info;
8815                     }
8816                   else
8817                     {
8818                       pattern_def_si = gsi_none ();
8819                       transform_pattern_stmt = false;
8820                     }
8821                 }
8822               else
8823                 transform_pattern_stmt = false;
8824             }
8825
8826           if (STMT_VINFO_VECTYPE (stmt_info))
8827             {
8828               poly_uint64 nunits
8829                 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8830               if (!STMT_SLP_TYPE (stmt_info)
8831                   && maybe_ne (nunits, vf)
8832                   && dump_enabled_p ())
8833                   /* For SLP VF is set according to unrolling factor, and not
8834                      to vector size, hence for SLP this print is not valid.  */
8835                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8836             }
8837
8838           /* SLP. Schedule all the SLP instances when the first SLP stmt is
8839              reached.  */
8840           if (STMT_SLP_TYPE (stmt_info))
8841             {
8842               if (!slp_scheduled)
8843                 {
8844                   slp_scheduled = true;
8845
8846                   if (dump_enabled_p ())
8847                     dump_printf_loc (MSG_NOTE, vect_location,
8848                                      "=== scheduling SLP instances ===\n");
8849
8850                   vect_schedule_slp (loop_vinfo);
8851                 }
8852
8853               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8854               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8855                 {
8856                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8857                     {
8858                       pattern_def_seq = NULL;
8859                       gsi_next (&si);
8860                     }
8861                   continue;
8862                 }
8863             }
8864
8865           /* -------- vectorize statement ------------ */
8866           if (dump_enabled_p ())
8867             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8868
8869           grouped_store = false;
8870           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8871           if (is_store)
8872             {
8873               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8874                 {
8875                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8876                      interleaving chain was completed - free all the stores in
8877                      the chain.  */
8878                   gsi_next (&si);
8879                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8880                 }
8881               else
8882                 {
8883                   /* Free the attached stmt_vec_info and remove the stmt.  */
8884                   gimple *store = gsi_stmt (si);
8885                   free_stmt_vec_info (store);
8886                   unlink_stmt_vdef (store);
8887                   gsi_remove (&si, true);
8888                   release_defs (store);
8889                 }
8890
8891               /* Stores can only appear at the end of pattern statements.  */
8892               gcc_assert (!transform_pattern_stmt);
8893               pattern_def_seq = NULL;
8894             }
8895           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8896             {
8897               pattern_def_seq = NULL;
8898               gsi_next (&si);
8899             }
8900         }                       /* stmts in BB */
8901
8902       /* Stub out scalar statements that must not survive vectorization.
8903          Doing this here helps with grouped statements, or statements that
8904          are involved in patterns.  */
8905       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8906            !gsi_end_p (gsi); gsi_next (&gsi))
8907         {
8908           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8909           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8910             {
8911               tree lhs = gimple_get_lhs (call);
8912               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8913                 {
8914                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8915                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8916                   gsi_replace (&gsi, new_stmt, true);
8917                 }
8918             }
8919         }
8920     }                           /* BBs in loop */
8921
8922   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8923      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8924   if (integer_onep (step_vector))
8925     niters_no_overflow = true;
8926   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8927                            niters_vector_mult_vf, !niters_no_overflow);
8928
8929   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8930   scale_profile_for_vect_loop (loop, assumed_vf);
8931
8932   /* True if the final iteration might not handle a full vector's
8933      worth of scalar iterations.  */
8934   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8935   /* The minimum number of iterations performed by the epilogue.  This
8936      is 1 when peeling for gaps because we always need a final scalar
8937      iteration.  */
8938   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8939   /* +1 to convert latch counts to loop iteration counts,
8940      -min_epilogue_iters to remove iterations that cannot be performed
8941        by the vector code.  */
8942   int bias_for_lowest = 1 - min_epilogue_iters;
8943   int bias_for_assumed = bias_for_lowest;
8944   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8945   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8946     {
8947       /* When the amount of peeling is known at compile time, the first
8948          iteration will have exactly alignment_npeels active elements.
8949          In the worst case it will have at least one.  */
8950       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8951       bias_for_lowest += lowest_vf - min_first_active;
8952       bias_for_assumed += assumed_vf - min_first_active;
8953     }
8954   /* In these calculations the "- 1" converts loop iteration counts
8955      back to latch counts.  */
8956   if (loop->any_upper_bound)
8957     loop->nb_iterations_upper_bound
8958       = (final_iter_may_be_partial
8959          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8960                           lowest_vf) - 1
8961          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8962                            lowest_vf) - 1);
8963   if (loop->any_likely_upper_bound)
8964     loop->nb_iterations_likely_upper_bound
8965       = (final_iter_may_be_partial
8966          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8967                           + bias_for_lowest, lowest_vf) - 1
8968          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8969                            + bias_for_lowest, lowest_vf) - 1);
8970   if (loop->any_estimate)
8971     loop->nb_iterations_estimate
8972       = (final_iter_may_be_partial
8973          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8974                           assumed_vf) - 1
8975          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8976                            assumed_vf) - 1);
8977
8978   if (dump_enabled_p ())
8979     {
8980       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8981         {
8982           dump_printf_loc (MSG_NOTE, vect_location,
8983                            "LOOP VECTORIZED\n");
8984           if (loop->inner)
8985             dump_printf_loc (MSG_NOTE, vect_location,
8986                              "OUTER LOOP VECTORIZED\n");
8987           dump_printf (MSG_NOTE, "\n");
8988         }
8989       else
8990         {
8991           dump_printf_loc (MSG_NOTE, vect_location,
8992                            "LOOP EPILOGUE VECTORIZED (VS=");
8993           dump_dec (MSG_NOTE, current_vector_size);
8994           dump_printf (MSG_NOTE, ")\n");
8995         }
8996     }
8997
8998   /* Free SLP instances here because otherwise stmt reference counting
8999      won't work.  */
9000   slp_instance instance;
9001   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9002     vect_free_slp_instance (instance);
9003   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9004   /* Clear-up safelen field since its value is invalid after vectorization
9005      since vectorized loop can have loop-carried dependencies.  */
9006   loop->safelen = 0;
9007
9008   /* Don't vectorize epilogue for epilogue.  */
9009   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9010     epilogue = NULL;
9011
9012   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9013     epilogue = NULL;
9014
9015   if (epilogue)
9016     {
9017       auto_vector_sizes vector_sizes;
9018       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9019       unsigned int next_size = 0;
9020
9021       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9022           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9023           && known_eq (vf, lowest_vf))
9024         {
9025           unsigned int eiters
9026             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9027                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9028           eiters = eiters % lowest_vf;
9029           epilogue->nb_iterations_upper_bound = eiters - 1;
9030
9031           unsigned int ratio;
9032           while (next_size < vector_sizes.length ()
9033                  && !(constant_multiple_p (current_vector_size,
9034                                            vector_sizes[next_size], &ratio)
9035                       && eiters >= lowest_vf / ratio))
9036             next_size += 1;
9037         }
9038       else
9039         while (next_size < vector_sizes.length ()
9040                && maybe_lt (current_vector_size, vector_sizes[next_size]))
9041           next_size += 1;
9042
9043       if (next_size == vector_sizes.length ())
9044         epilogue = NULL;
9045     }
9046
9047   if (epilogue)
9048     {
9049       epilogue->force_vectorize = loop->force_vectorize;
9050       epilogue->safelen = loop->safelen;
9051       epilogue->dont_vectorize = false;
9052
9053       /* We may need to if-convert epilogue to vectorize it.  */
9054       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9055         tree_if_conversion (epilogue);
9056     }
9057
9058   return epilogue;
9059 }
9060
9061 /* The code below is trying to perform simple optimization - revert
9062    if-conversion for masked stores, i.e. if the mask of a store is zero
9063    do not perform it and all stored value producers also if possible.
9064    For example,
9065      for (i=0; i<n; i++)
9066        if (c[i])
9067         {
9068           p1[i] += 1;
9069           p2[i] = p3[i] +2;
9070         }
9071    this transformation will produce the following semi-hammock:
9072
9073    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9074      {
9075        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9076        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9077        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9078        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9079        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9080        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9081      }
9082 */
9083
9084 void
9085 optimize_mask_stores (struct loop *loop)
9086 {
9087   basic_block *bbs = get_loop_body (loop);
9088   unsigned nbbs = loop->num_nodes;
9089   unsigned i;
9090   basic_block bb;
9091   struct loop *bb_loop;
9092   gimple_stmt_iterator gsi;
9093   gimple *stmt;
9094   auto_vec<gimple *> worklist;
9095
9096   vect_location = find_loop_location (loop);
9097   /* Pick up all masked stores in loop if any.  */
9098   for (i = 0; i < nbbs; i++)
9099     {
9100       bb = bbs[i];
9101       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9102            gsi_next (&gsi))
9103         {
9104           stmt = gsi_stmt (gsi);
9105           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9106             worklist.safe_push (stmt);
9107         }
9108     }
9109
9110   free (bbs);
9111   if (worklist.is_empty ())
9112     return;
9113
9114   /* Loop has masked stores.  */
9115   while (!worklist.is_empty ())
9116     {
9117       gimple *last, *last_store;
9118       edge e, efalse;
9119       tree mask;
9120       basic_block store_bb, join_bb;
9121       gimple_stmt_iterator gsi_to;
9122       tree vdef, new_vdef;
9123       gphi *phi;
9124       tree vectype;
9125       tree zero;
9126
9127       last = worklist.pop ();
9128       mask = gimple_call_arg (last, 2);
9129       bb = gimple_bb (last);
9130       /* Create then_bb and if-then structure in CFG, then_bb belongs to
9131          the same loop as if_bb.  It could be different to LOOP when two
9132          level loop-nest is vectorized and mask_store belongs to the inner
9133          one.  */
9134       e = split_block (bb, last);
9135       bb_loop = bb->loop_father;
9136       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9137       join_bb = e->dest;
9138       store_bb = create_empty_bb (bb);
9139       add_bb_to_loop (store_bb, bb_loop);
9140       e->flags = EDGE_TRUE_VALUE;
9141       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9142       /* Put STORE_BB to likely part.  */
9143       efalse->probability = profile_probability::unlikely ();
9144       store_bb->count = efalse->count ();
9145       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9146       if (dom_info_available_p (CDI_DOMINATORS))
9147         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9148       if (dump_enabled_p ())
9149         dump_printf_loc (MSG_NOTE, vect_location,
9150                          "Create new block %d to sink mask stores.",
9151                          store_bb->index);
9152       /* Create vector comparison with boolean result.  */
9153       vectype = TREE_TYPE (mask);
9154       zero = build_zero_cst (vectype);
9155       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9156       gsi = gsi_last_bb (bb);
9157       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9158       /* Create new PHI node for vdef of the last masked store:
9159          .MEM_2 = VDEF <.MEM_1>
9160          will be converted to
9161          .MEM.3 = VDEF <.MEM_1>
9162          and new PHI node will be created in join bb
9163          .MEM_2 = PHI <.MEM_1, .MEM_3>
9164       */
9165       vdef = gimple_vdef (last);
9166       new_vdef = make_ssa_name (gimple_vop (cfun), last);
9167       gimple_set_vdef (last, new_vdef);
9168       phi = create_phi_node (vdef, join_bb);
9169       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9170
9171       /* Put all masked stores with the same mask to STORE_BB if possible.  */
9172       while (true)
9173         {
9174           gimple_stmt_iterator gsi_from;
9175           gimple *stmt1 = NULL;
9176
9177           /* Move masked store to STORE_BB.  */
9178           last_store = last;
9179           gsi = gsi_for_stmt (last);
9180           gsi_from = gsi;
9181           /* Shift GSI to the previous stmt for further traversal.  */
9182           gsi_prev (&gsi);
9183           gsi_to = gsi_start_bb (store_bb);
9184           gsi_move_before (&gsi_from, &gsi_to);
9185           /* Setup GSI_TO to the non-empty block start.  */
9186           gsi_to = gsi_start_bb (store_bb);
9187           if (dump_enabled_p ())
9188             {
9189               dump_printf_loc (MSG_NOTE, vect_location,
9190                                "Move stmt to created bb\n");
9191               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9192             }
9193           /* Move all stored value producers if possible.  */
9194           while (!gsi_end_p (gsi))
9195             {
9196               tree lhs;
9197               imm_use_iterator imm_iter;
9198               use_operand_p use_p;
9199               bool res;
9200
9201               /* Skip debug statements.  */
9202               if (is_gimple_debug (gsi_stmt (gsi)))
9203                 {
9204                   gsi_prev (&gsi);
9205                   continue;
9206                 }
9207               stmt1 = gsi_stmt (gsi);
9208               /* Do not consider statements writing to memory or having
9209                  volatile operand.  */
9210               if (gimple_vdef (stmt1)
9211                   || gimple_has_volatile_ops (stmt1))
9212                 break;
9213               gsi_from = gsi;
9214               gsi_prev (&gsi);
9215               lhs = gimple_get_lhs (stmt1);
9216               if (!lhs)
9217                 break;
9218
9219               /* LHS of vectorized stmt must be SSA_NAME.  */
9220               if (TREE_CODE (lhs) != SSA_NAME)
9221                 break;
9222
9223               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9224                 {
9225                   /* Remove dead scalar statement.  */
9226                   if (has_zero_uses (lhs))
9227                     {
9228                       gsi_remove (&gsi_from, true);
9229                       continue;
9230                     }
9231                 }
9232
9233               /* Check that LHS does not have uses outside of STORE_BB.  */
9234               res = true;
9235               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9236                 {
9237                   gimple *use_stmt;
9238                   use_stmt = USE_STMT (use_p);
9239                   if (is_gimple_debug (use_stmt))
9240                     continue;
9241                   if (gimple_bb (use_stmt) != store_bb)
9242                     {
9243                       res = false;
9244                       break;
9245                     }
9246                 }
9247               if (!res)
9248                 break;
9249
9250               if (gimple_vuse (stmt1)
9251                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
9252                 break;
9253
9254               /* Can move STMT1 to STORE_BB.  */
9255               if (dump_enabled_p ())
9256                 {
9257                   dump_printf_loc (MSG_NOTE, vect_location,
9258                                    "Move stmt to created bb\n");
9259                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9260                 }
9261               gsi_move_before (&gsi_from, &gsi_to);
9262               /* Shift GSI_TO for further insertion.  */
9263               gsi_prev (&gsi_to);
9264             }
9265           /* Put other masked stores with the same mask to STORE_BB.  */
9266           if (worklist.is_empty ()
9267               || gimple_call_arg (worklist.last (), 2) != mask
9268               || worklist.last () != stmt1)
9269             break;
9270           last = worklist.pop ();
9271         }
9272       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9273     }
9274 }