gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56
  57 /* Loop Vectorization Pass.
  58
  59    This pass tries to vectorize loops.
  60
  61    For example, the vectorizer transforms the following simple loop:
  62
  63         short a[N]; short b[N]; short c[N]; int i;
  64
  65         for (i=0; i<N; i++){
  66           a[i] = b[i] + c[i];
  67         }
  68
  69    as if it was manually vectorized by rewriting the source code into:
  70
  71         typedef int __attribute__((mode(V8HI))) v8hi;
  72         short a[N];  short b[N]; short c[N];   int i;
  73         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  74         v8hi va, vb, vc;
  75
  76         for (i=0; i<N/8; i++){
  77           vb = pb[i];
  78           vc = pc[i];
  79           va = vb + vc;
  80           pa[i] = va;
  81         }
  82
  83         The main entry to this pass is vectorize_loops(), in which
  84    the vectorizer applies a set of analyses on a given set of loops,
  85    followed by the actual vectorization transformation for the loops that
  86    had successfully passed the analysis phase.
  87         Throughout this pass we make a distinction between two types of
  88    data: scalars (which are represented by SSA_NAMES), and memory references
  89    ("data-refs").  These two types of data require different handling both
  90    during analysis and transformation. The types of data-refs that the
  91    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  92    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  93    accesses are required to have a simple (consecutive) access pattern.
  94
  95    Analysis phase:
  96    ===============
  97         The driver for the analysis phase is vect_analyze_loop().
  98    It applies a set of analyses, some of which rely on the scalar evolution
  99    analyzer (scev) developed by Sebastian Pop.
 100
 101         During the analysis phase the vectorizer records some information
 102    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 103    loop, as well as general information about the loop as a whole, which is
 104    recorded in a "loop_vec_info" struct attached to each loop.
 105
 106    Transformation phase:
 107    =====================
 108         The loop transformation phase scans all the stmts in the loop, and
 109    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 110    the loop that needs to be vectorized.  It inserts the vector code sequence
 111    just before the scalar stmt S, and records a pointer to the vector code
 112    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 113    attached to S).  This pointer will be used for the vectorization of following
 114    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 115    otherwise, we rely on dead code elimination for removing it.
 116
 117         For example, say stmt S1 was vectorized into stmt VS1:
 118
 119    VS1: vb = px[i];
 120    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 121    S2:  a = b;
 122
 123    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 124    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 125    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 126    resulting sequence would be:
 127
 128    VS1: vb = px[i];
 129    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 130    VS2: va = vb;
 131    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 132
 133         Operands that are not SSA_NAMEs, are data-refs that appear in
 134    load/store operations (like 'x[i]' in S1), and are handled differently.
 135
 136    Target modeling:
 137    =================
 138         Currently the only target specific information that is used is the
 139    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 140    Targets that can support different sizes of vectors, for now will need
 141    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 142    flexibility will be added in the future.
 143
 144         Since we only vectorize operations which vector form can be
 145    expressed using existing tree codes, to verify that an operation is
 146    supported, the vectorizer checks the relevant optab at the relevant
 147    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 148    the value found is CODE_FOR_nothing, then there's no target support, and
 149    we can't vectorize the stmt.
 150
 151    For additional information on this project see:
 152    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 153 */
 154
 155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 156
 157 /* Function vect_determine_vectorization_factor
 158
 159    Determine the vectorization factor (VF).  VF is the number of data elements
 160    that are operated upon in parallel in a single iteration of the vectorized
 161    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 162    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 163    elements can fit in a single vector register.
 164
 165    We currently support vectorization of loops in which all types operated upon
 166    are of the same size.  Therefore this function currently sets VF according to
 167    the size of the types operated upon, and fails if there are multiple sizes
 168    in the loop.
 169
 170    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 171    original loop:
 172         for (i=0; i<N; i++){
 173           a[i] = b[i] + c[i];
 174         }
 175
 176    vectorized loop:
 177         for (i=0; i<N; i+=VF){
 178           a[i:VF] = b[i:VF] + c[i:VF];
 179         }
 180 */
 181
 182 static bool
 183 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 184 {
 185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 187   unsigned nbbs = loop->num_nodes;
 188   poly_uint64 vectorization_factor = 1;
 189   tree scalar_type = NULL_TREE;
 190   gphi *phi;
 191   tree vectype;
 192   stmt_vec_info stmt_info;
 193   unsigned i;
 194   HOST_WIDE_INT dummy;
 195   gimple *stmt, *pattern_stmt = NULL;
 196   gimple_seq pattern_def_seq = NULL;
 197   gimple_stmt_iterator pattern_def_si = gsi_none ();
 198   bool analyze_pattern_stmt = false;
 199   bool bool_result;
 200   auto_vec<stmt_vec_info> mask_producers;
 201
 202   if (dump_enabled_p ())
 203     dump_printf_loc (MSG_NOTE, vect_location,
 204                      "=== vect_determine_vectorization_factor ===\n");
 205
 206   for (i = 0; i < nbbs; i++)
 207     {
 208       basic_block bb = bbs[i];
 209
 210       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 211            gsi_next (&si))
 212         {
 213           phi = si.phi ();
 214           stmt_info = vinfo_for_stmt (phi);
 215           if (dump_enabled_p ())
 216             {
 217               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 218               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 219             }
 220
 221           gcc_assert (stmt_info);
 222
 223           if (STMT_VINFO_RELEVANT_P (stmt_info)
 224               || STMT_VINFO_LIVE_P (stmt_info))
 225             {
 226               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 227               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 228
 229               if (dump_enabled_p ())
 230                 {
 231                   dump_printf_loc (MSG_NOTE, vect_location,
 232                                    "get vectype for scalar type:  ");
 233                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 234                   dump_printf (MSG_NOTE, "\n");
 235                 }
 236
 237               vectype = get_vectype_for_scalar_type (scalar_type);
 238               if (!vectype)
 239                 {
 240                   if (dump_enabled_p ())
 241                     {
 242                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 243                                        "not vectorized: unsupported "
 244                                        "data-type ");
 245                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 246                                          scalar_type);
 247                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 248                     }
 249                   return false;
 250                 }
 251               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 252
 253               if (dump_enabled_p ())
 254                 {
 255                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 256                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 257                   dump_printf (MSG_NOTE, "\n");
 258                 }
 259
 260               if (dump_enabled_p ())
 261                 {
 262                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 263                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 264                   dump_printf (MSG_NOTE, "\n");
 265                 }
 266
 267               vect_update_max_nunits (&vectorization_factor, vectype);
 268             }
 269         }
 270
 271       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 272            !gsi_end_p (si) || analyze_pattern_stmt;)
 273         {
 274           tree vf_vectype;
 275
 276           if (analyze_pattern_stmt)
 277             stmt = pattern_stmt;
 278           else
 279             stmt = gsi_stmt (si);
 280
 281           stmt_info = vinfo_for_stmt (stmt);
 282
 283           if (dump_enabled_p ())
 284             {
 285               dump_printf_loc (MSG_NOTE, vect_location,
 286                                "==> examining statement: ");
 287               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 288             }
 289
 290           gcc_assert (stmt_info);
 291
 292           /* Skip stmts which do not need to be vectorized.  */
 293           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 294                && !STMT_VINFO_LIVE_P (stmt_info))
 295               || gimple_clobber_p (stmt))
 296             {
 297               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 298                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 299                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 300                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 301                 {
 302                   stmt = pattern_stmt;
 303                   stmt_info = vinfo_for_stmt (pattern_stmt);
 304                   if (dump_enabled_p ())
 305                     {
 306                       dump_printf_loc (MSG_NOTE, vect_location,
 307                                        "==> examining pattern statement: ");
 308                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 309                     }
 310                 }
 311               else
 312                 {
 313                   if (dump_enabled_p ())
 314                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 315                   gsi_next (&si);
 316                   continue;
 317                 }
 318             }
 319           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 320                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 321                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 322                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 323             analyze_pattern_stmt = true;
 324
 325           /* If a pattern statement has def stmts, analyze them too.  */
 326           if (is_pattern_stmt_p (stmt_info))
 327             {
 328               if (pattern_def_seq == NULL)
 329                 {
 330                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 331                   pattern_def_si = gsi_start (pattern_def_seq);
 332                 }
 333               else if (!gsi_end_p (pattern_def_si))
 334                 gsi_next (&pattern_def_si);
 335               if (pattern_def_seq != NULL)
 336                 {
 337                   gimple *pattern_def_stmt = NULL;
 338                   stmt_vec_info pattern_def_stmt_info = NULL;
 339
 340                   while (!gsi_end_p (pattern_def_si))
 341                     {
 342                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 343                       pattern_def_stmt_info
 344                         = vinfo_for_stmt (pattern_def_stmt);
 345                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 346                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 347                         break;
 348                       gsi_next (&pattern_def_si);
 349                     }
 350
 351                   if (!gsi_end_p (pattern_def_si))
 352                     {
 353                       if (dump_enabled_p ())
 354                         {
 355                           dump_printf_loc (MSG_NOTE, vect_location,
 356                                            "==> examining pattern def stmt: ");
 357                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 358                                             pattern_def_stmt, 0);
 359                         }
 360
 361                       stmt = pattern_def_stmt;
 362                       stmt_info = pattern_def_stmt_info;
 363                     }
 364                   else
 365                     {
 366                       pattern_def_si = gsi_none ();
 367                       analyze_pattern_stmt = false;
 368                     }
 369                 }
 370               else
 371                 analyze_pattern_stmt = false;
 372             }
 373
 374           if (gimple_get_lhs (stmt) == NULL_TREE
 375               /* MASK_STORE has no lhs, but is ok.  */
 376               && (!is_gimple_call (stmt)
 377                   || !gimple_call_internal_p (stmt)
 378                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 379             {
 380               if (is_gimple_call (stmt))
 381                 {
 382                   /* Ignore calls with no lhs.  These must be calls to
 383                      #pragma omp simd functions, and what vectorization factor
 384                      it really needs can't be determined until
 385                      vectorizable_simd_clone_call.  */
 386                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 387                     {
 388                       pattern_def_seq = NULL;
 389                       gsi_next (&si);
 390                     }
 391                   continue;
 392                 }
 393               if (dump_enabled_p ())
 394                 {
 395                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 396                                    "not vectorized: irregular stmt.");
 397                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 398                                     0);
 399                 }
 400               return false;
 401             }
 402
 403           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 404             {
 405               if (dump_enabled_p ())
 406                 {
 407                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 408                                    "not vectorized: vector stmt in loop:");
 409                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 410                 }
 411               return false;
 412             }
 413
 414           bool_result = false;
 415
 416           if (STMT_VINFO_VECTYPE (stmt_info))
 417             {
 418               /* The only case when a vectype had been already set is for stmts
 419                  that contain a dataref, or for "pattern-stmts" (stmts
 420                  generated by the vectorizer to represent/replace a certain
 421                  idiom).  */
 422               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 423                           || is_pattern_stmt_p (stmt_info)
 424                           || !gsi_end_p (pattern_def_si));
 425               vectype = STMT_VINFO_VECTYPE (stmt_info);
 426             }
 427           else
 428             {
 429               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 430               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 431                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 432               else
 433                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 434
 435               /* Bool ops don't participate in vectorization factor
 436                  computation.  For comparison use compared types to
 437                  compute a factor.  */
 438               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 439                   && is_gimple_assign (stmt)
 440                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 441                 {
 442                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 443                       || STMT_VINFO_LIVE_P (stmt_info))
 444                     mask_producers.safe_push (stmt_info);
 445                   bool_result = true;
 446
 447                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 448                       == tcc_comparison
 449                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 450                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 451                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 452                   else
 453                     {
 454                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 455                         {
 456                           pattern_def_seq = NULL;
 457                           gsi_next (&si);
 458                         }
 459                       continue;
 460                     }
 461                 }
 462
 463               if (dump_enabled_p ())
 464                 {
 465                   dump_printf_loc (MSG_NOTE, vect_location,
 466                                    "get vectype for scalar type:  ");
 467                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 468                   dump_printf (MSG_NOTE, "\n");
 469                 }
 470               vectype = get_vectype_for_scalar_type (scalar_type);
 471               if (!vectype)
 472                 {
 473                   if (dump_enabled_p ())
 474                     {
 475                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 476                                        "not vectorized: unsupported "
 477                                        "data-type ");
 478                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 479                                          scalar_type);
 480                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 481                     }
 482                   return false;
 483                 }
 484
 485               if (!bool_result)
 486                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 487
 488               if (dump_enabled_p ())
 489                 {
 490                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 491                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 492                   dump_printf (MSG_NOTE, "\n");
 493                 }
 494             }
 495
 496           /* Don't try to compute VF out scalar types if we stmt
 497              produces boolean vector.  Use result vectype instead.  */
 498           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 499             vf_vectype = vectype;
 500           else
 501             {
 502               /* The vectorization factor is according to the smallest
 503                  scalar type (or the largest vector size, but we only
 504                  support one vector size per loop).  */
 505               if (!bool_result)
 506                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 507                                                              &dummy);
 508               if (dump_enabled_p ())
 509                 {
 510                   dump_printf_loc (MSG_NOTE, vect_location,
 511                                    "get vectype for scalar type:  ");
 512                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 513                   dump_printf (MSG_NOTE, "\n");
 514                 }
 515               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 516             }
 517           if (!vf_vectype)
 518             {
 519               if (dump_enabled_p ())
 520                 {
 521                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 522                                    "not vectorized: unsupported data-type ");
 523                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 524                                      scalar_type);
 525                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 526                 }
 527               return false;
 528             }
 529
 530           if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
 531                         GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 532             {
 533               if (dump_enabled_p ())
 534                 {
 535                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 536                                    "not vectorized: different sized vector "
 537                                    "types in statement, ");
 538                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 539                                      vectype);
 540                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 541                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 542                                      vf_vectype);
 543                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 544                 }
 545               return false;
 546             }
 547
 548           if (dump_enabled_p ())
 549             {
 550               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 551               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 552               dump_printf (MSG_NOTE, "\n");
 553             }
 554
 555           if (dump_enabled_p ())
 556             {
 557               dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 558               dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
 559               dump_printf (MSG_NOTE, "\n");
 560             }
 561
 562           vect_update_max_nunits (&vectorization_factor, vf_vectype);
 563
 564           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 565             {
 566               pattern_def_seq = NULL;
 567               gsi_next (&si);
 568             }
 569         }
 570     }
 571
 572   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 573   if (dump_enabled_p ())
 574     {
 575       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 576       dump_dec (MSG_NOTE, vectorization_factor);
 577       dump_printf (MSG_NOTE, "\n");
 578     }
 579
 580   if (known_le (vectorization_factor, 1U))
 581     {
 582       if (dump_enabled_p ())
 583         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 584                          "not vectorized: unsupported data-type\n");
 585       return false;
 586     }
 587   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 588
 589   for (i = 0; i < mask_producers.length (); i++)
 590     {
 591       tree mask_type = NULL;
 592
 593       stmt = STMT_VINFO_STMT (mask_producers[i]);
 594
 595       if (is_gimple_assign (stmt)
 596           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 597           && !VECT_SCALAR_BOOLEAN_TYPE_P
 598                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 599         {
 600           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 601           mask_type = get_mask_type_for_scalar_type (scalar_type);
 602
 603           if (!mask_type)
 604             {
 605               if (dump_enabled_p ())
 606                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 607                                  "not vectorized: unsupported mask\n");
 608               return false;
 609             }
 610         }
 611       else
 612         {
 613           tree rhs;
 614           ssa_op_iter iter;
 615           gimple *def_stmt;
 616           enum vect_def_type dt;
 617
 618           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 619             {
 620               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 621                                        &def_stmt, &dt, &vectype))
 622                 {
 623                   if (dump_enabled_p ())
 624                     {
 625                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 626                                        "not vectorized: can't compute mask type "
 627                                        "for statement, ");
 628                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 629                                         0);
 630                     }
 631                   return false;
 632                 }
 633
 634               /* No vectype probably means external definition.
 635                  Allow it in case there is another operand which
 636                  allows to determine mask type.  */
 637               if (!vectype)
 638                 continue;
 639
 640               if (!mask_type)
 641                 mask_type = vectype;
 642               else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
 643                                  TYPE_VECTOR_SUBPARTS (vectype)))
 644                 {
 645                   if (dump_enabled_p ())
 646                     {
 647                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 648                                        "not vectorized: different sized masks "
 649                                        "types in statement, ");
 650                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 651                                          mask_type);
 652                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 653                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 654                                          vectype);
 655                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 656                     }
 657                   return false;
 658                 }
 659               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 660                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 661                 {
 662                   if (dump_enabled_p ())
 663                     {
 664                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 665                                        "not vectorized: mixed mask and "
 666                                        "nonmask vector types in statement, ");
 667                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 668                                          mask_type);
 669                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 670                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 671                                          vectype);
 672                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 673                     }
 674                   return false;
 675                 }
 676             }
 677
 678           /* We may compare boolean value loaded as vector of integers.
 679              Fix mask_type in such case.  */
 680           if (mask_type
 681               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 682               && gimple_code (stmt) == GIMPLE_ASSIGN
 683               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 684             mask_type = build_same_sized_truth_vector_type (mask_type);
 685         }
 686
 687       /* No mask_type should mean loop invariant predicate.
 688          This is probably a subject for optimization in
 689          if-conversion.  */
 690       if (!mask_type)
 691         {
 692           if (dump_enabled_p ())
 693             {
 694               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 695                                "not vectorized: can't compute mask type "
 696                                "for statement, ");
 697               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 698                                 0);
 699             }
 700           return false;
 701         }
 702
 703       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 704     }
 705
 706   return true;
 707 }
 708
 709
 710 /* Function vect_is_simple_iv_evolution.
 711
 712    FORNOW: A simple evolution of an induction variables in the loop is
 713    considered a polynomial evolution.  */
 714
 715 static bool
 716 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 717                              tree * step)
 718 {
 719   tree init_expr;
 720   tree step_expr;
 721   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 722   basic_block bb;
 723
 724   /* When there is no evolution in this loop, the evolution function
 725      is not "simple".  */
 726   if (evolution_part == NULL_TREE)
 727     return false;
 728
 729   /* When the evolution is a polynomial of degree >= 2
 730      the evolution function is not "simple".  */
 731   if (tree_is_chrec (evolution_part))
 732     return false;
 733
 734   step_expr = evolution_part;
 735   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 736
 737   if (dump_enabled_p ())
 738     {
 739       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 740       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 741       dump_printf (MSG_NOTE, ",  init: ");
 742       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 743       dump_printf (MSG_NOTE, "\n");
 744     }
 745
 746   *init = init_expr;
 747   *step = step_expr;
 748
 749   if (TREE_CODE (step_expr) != INTEGER_CST
 750       && (TREE_CODE (step_expr) != SSA_NAME
 751           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 752               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 753           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 754               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 755                   || !flag_associative_math)))
 756       && (TREE_CODE (step_expr) != REAL_CST
 757           || !flag_associative_math))
 758     {
 759       if (dump_enabled_p ())
 760         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 761                          "step unknown.\n");
 762       return false;
 763     }
 764
 765   return true;
 766 }
 767
 768 /* Function vect_analyze_scalar_cycles_1.
 769
 770    Examine the cross iteration def-use cycles of scalar variables
 771    in LOOP.  LOOP_VINFO represents the loop that is now being
 772    considered for vectorization (can be LOOP, or an outer-loop
 773    enclosing LOOP).  */
 774
 775 static void
 776 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 777 {
 778   basic_block bb = loop->header;
 779   tree init, step;
 780   auto_vec<gimple *, 64> worklist;
 781   gphi_iterator gsi;
 782   bool double_reduc;
 783
 784   if (dump_enabled_p ())
 785     dump_printf_loc (MSG_NOTE, vect_location,
 786                      "=== vect_analyze_scalar_cycles ===\n");
 787
 788   /* First - identify all inductions.  Reduction detection assumes that all the
 789      inductions have been identified, therefore, this order must not be
 790      changed.  */
 791   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 792     {
 793       gphi *phi = gsi.phi ();
 794       tree access_fn = NULL;
 795       tree def = PHI_RESULT (phi);
 796       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 797
 798       if (dump_enabled_p ())
 799         {
 800           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 801           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 802         }
 803
 804       /* Skip virtual phi's.  The data dependences that are associated with
 805          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 806       if (virtual_operand_p (def))
 807         continue;
 808
 809       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 810
 811       /* Analyze the evolution function.  */
 812       access_fn = analyze_scalar_evolution (loop, def);
 813       if (access_fn)
 814         {
 815           STRIP_NOPS (access_fn);
 816           if (dump_enabled_p ())
 817             {
 818               dump_printf_loc (MSG_NOTE, vect_location,
 819                                "Access function of PHI: ");
 820               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 821               dump_printf (MSG_NOTE, "\n");
 822             }
 823           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 824             = initial_condition_in_loop_num (access_fn, loop->num);
 825           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 826             = evolution_part_in_loop_num (access_fn, loop->num);
 827         }
 828
 829       if (!access_fn
 830           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 831           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 832               && TREE_CODE (step) != INTEGER_CST))
 833         {
 834           worklist.safe_push (phi);
 835           continue;
 836         }
 837
 838       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 839                   != NULL_TREE);
 840       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 841
 842       if (dump_enabled_p ())
 843         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 844       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 845     }
 846
 847
 848   /* Second - identify all reductions and nested cycles.  */
 849   while (worklist.length () > 0)
 850     {
 851       gimple *phi = worklist.pop ();
 852       tree def = PHI_RESULT (phi);
 853       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 854       gimple *reduc_stmt;
 855
 856       if (dump_enabled_p ())
 857         {
 858           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 859           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 860         }
 861
 862       gcc_assert (!virtual_operand_p (def)
 863                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 864
 865       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 866                                                 &double_reduc, false);
 867       if (reduc_stmt)
 868         {
 869           if (double_reduc)
 870             {
 871               if (dump_enabled_p ())
 872                 dump_printf_loc (MSG_NOTE, vect_location,
 873                                  "Detected double reduction.\n");
 874
 875               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 876               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 877                                                     vect_double_reduction_def;
 878             }
 879           else
 880             {
 881               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 882                 {
 883                   if (dump_enabled_p ())
 884                     dump_printf_loc (MSG_NOTE, vect_location,
 885                                      "Detected vectorizable nested cycle.\n");
 886
 887                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 888                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 889                                                              vect_nested_cycle;
 890                 }
 891               else
 892                 {
 893                   if (dump_enabled_p ())
 894                     dump_printf_loc (MSG_NOTE, vect_location,
 895                                      "Detected reduction.\n");
 896
 897                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 898                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 899                                                            vect_reduction_def;
 900                   /* Store the reduction cycles for possible vectorization in
 901                      loop-aware SLP if it was not detected as reduction
 902                      chain.  */
 903                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 904                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 905                 }
 906             }
 907         }
 908       else
 909         if (dump_enabled_p ())
 910           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 911                            "Unknown def-use cycle pattern.\n");
 912     }
 913 }
 914
 915
 916 /* Function vect_analyze_scalar_cycles.
 917
 918    Examine the cross iteration def-use cycles of scalar variables, by
 919    analyzing the loop-header PHIs of scalar variables.  Classify each
 920    cycle as one of the following: invariant, induction, reduction, unknown.
 921    We do that for the loop represented by LOOP_VINFO, and also to its
 922    inner-loop, if exists.
 923    Examples for scalar cycles:
 924
 925    Example1: reduction:
 926
 927               loop1:
 928               for (i=0; i<N; i++)
 929                  sum += a[i];
 930
 931    Example2: induction:
 932
 933               loop2:
 934               for (i=0; i<N; i++)
 935                  a[i] = i;  */
 936
 937 static void
 938 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 939 {
 940   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 941
 942   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 943
 944   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 945      Reductions in such inner-loop therefore have different properties than
 946      the reductions in the nest that gets vectorized:
 947      1. When vectorized, they are executed in the same order as in the original
 948         scalar loop, so we can't change the order of computation when
 949         vectorizing them.
 950      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 951         current checks are too strict.  */
 952
 953   if (loop->inner)
 954     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 955 }
 956
 957 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 958
 959 static void
 960 vect_fixup_reduc_chain (gimple *stmt)
 961 {
 962   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 963   gimple *stmtp;
 964   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 965               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 966   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 967   do
 968     {
 969       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 970       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 971       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 972       if (stmt)
 973         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 974           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 975     }
 976   while (stmt);
 977   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 978 }
 979
 980 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 981
 982 static void
 983 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 984 {
 985   gimple *first;
 986   unsigned i;
 987
 988   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 989     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 990       {
 991         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 992         while (next)
 993           {
 994             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 995               break;
 996             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 997           }
 998         /* If not all stmt in the chain are patterns try to handle
 999            the chain without patterns.  */
1000         if (! next)
1001           {
1002             vect_fixup_reduc_chain (first);
1003             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1004               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1005           }
1006       }
1007 }
1008
1009 /* Function vect_get_loop_niters.
1010
1011    Determine how many iterations the loop is executed and place it
1012    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1013    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1014    niter information holds in ASSUMPTIONS.
1015
1016    Return the loop exit condition.  */
1017
1018
1019 static gcond *
1020 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1021                       tree *number_of_iterations, tree *number_of_iterationsm1)
1022 {
1023   edge exit = single_exit (loop);
1024   struct tree_niter_desc niter_desc;
1025   tree niter_assumptions, niter, may_be_zero;
1026   gcond *cond = get_loop_exit_condition (loop);
1027
1028   *assumptions = boolean_true_node;
1029   *number_of_iterationsm1 = chrec_dont_know;
1030   *number_of_iterations = chrec_dont_know;
1031   if (dump_enabled_p ())
1032     dump_printf_loc (MSG_NOTE, vect_location,
1033                      "=== get_loop_niters ===\n");
1034
1035   if (!exit)
1036     return cond;
1037
1038   niter = chrec_dont_know;
1039   may_be_zero = NULL_TREE;
1040   niter_assumptions = boolean_true_node;
1041   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1042       || chrec_contains_undetermined (niter_desc.niter))
1043     return cond;
1044
1045   niter_assumptions = niter_desc.assumptions;
1046   may_be_zero = niter_desc.may_be_zero;
1047   niter = niter_desc.niter;
1048
1049   if (may_be_zero && integer_zerop (may_be_zero))
1050     may_be_zero = NULL_TREE;
1051
1052   if (may_be_zero)
1053     {
1054       if (COMPARISON_CLASS_P (may_be_zero))
1055         {
1056           /* Try to combine may_be_zero with assumptions, this can simplify
1057              computation of niter expression.  */
1058           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1059             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1060                                              niter_assumptions,
1061                                              fold_build1 (TRUTH_NOT_EXPR,
1062                                                           boolean_type_node,
1063                                                           may_be_zero));
1064           else
1065             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1066                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1067
1068           may_be_zero = NULL_TREE;
1069         }
1070       else if (integer_nonzerop (may_be_zero))
1071         {
1072           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1073           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1074           return cond;
1075         }
1076       else
1077         return cond;
1078     }
1079
1080   *assumptions = niter_assumptions;
1081   *number_of_iterationsm1 = niter;
1082
1083   /* We want the number of loop header executions which is the number
1084      of latch executions plus one.
1085      ???  For UINT_MAX latch executions this number overflows to zero
1086      for loops like do { n++; } while (n != 0);  */
1087   if (niter && !chrec_contains_undetermined (niter))
1088     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1089                           build_int_cst (TREE_TYPE (niter), 1));
1090   *number_of_iterations = niter;
1091
1092   return cond;
1093 }
1094
1095 /* Function bb_in_loop_p
1096
1097    Used as predicate for dfs order traversal of the loop bbs.  */
1098
1099 static bool
1100 bb_in_loop_p (const_basic_block bb, const void *data)
1101 {
1102   const struct loop *const loop = (const struct loop *)data;
1103   if (flow_bb_inside_loop_p (loop, bb))
1104     return true;
1105   return false;
1106 }
1107
1108
1109 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1110    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1111
1112 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1113   : vec_info (vec_info::loop, init_cost (loop_in)),
1114     loop (loop_in),
1115     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1116     num_itersm1 (NULL_TREE),
1117     num_iters (NULL_TREE),
1118     num_iters_unchanged (NULL_TREE),
1119     num_iters_assumptions (NULL_TREE),
1120     th (0),
1121     versioning_threshold (0),
1122     vectorization_factor (0),
1123     max_vectorization_factor (0),
1124     mask_skip_niters (NULL_TREE),
1125     mask_compare_type (NULL_TREE),
1126     unaligned_dr (NULL),
1127     peeling_for_alignment (0),
1128     ptr_mask (0),
1129     slp_unrolling_factor (1),
1130     single_scalar_iteration_cost (0),
1131     vectorizable (false),
1132     can_fully_mask_p (true),
1133     fully_masked_p (false),
1134     peeling_for_gaps (false),
1135     peeling_for_niter (false),
1136     operands_swapped (false),
1137     no_data_dependencies (false),
1138     has_mask_store (false),
1139     scalar_loop (NULL),
1140     orig_loop_info (NULL)
1141 {
1142   /* Create/Update stmt_info for all stmts in the loop.  */
1143   basic_block *body = get_loop_body (loop);
1144   for (unsigned int i = 0; i < loop->num_nodes; i++)
1145     {
1146       basic_block bb = body[i];
1147       gimple_stmt_iterator si;
1148
1149       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1150         {
1151           gimple *phi = gsi_stmt (si);
1152           gimple_set_uid (phi, 0);
1153           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1154         }
1155
1156       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1157         {
1158           gimple *stmt = gsi_stmt (si);
1159           gimple_set_uid (stmt, 0);
1160           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1161         }
1162     }
1163   free (body);
1164
1165   /* CHECKME: We want to visit all BBs before their successors (except for
1166      latch blocks, for which this assertion wouldn't hold).  In the simple
1167      case of the loop forms we allow, a dfs order of the BBs would the same
1168      as reversed postorder traversal, so we are safe.  */
1169
1170   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1171                                           bbs, loop->num_nodes, loop);
1172   gcc_assert (nbbs == loop->num_nodes);
1173 }
1174
1175 /* Free all levels of MASKS.  */
1176
1177 void
1178 release_vec_loop_masks (vec_loop_masks *masks)
1179 {
1180   rgroup_masks *rgm;
1181   unsigned int i;
1182   FOR_EACH_VEC_ELT (*masks, i, rgm)
1183     rgm->masks.release ();
1184   masks->release ();
1185 }
1186
1187 /* Free all memory used by the _loop_vec_info, as well as all the
1188    stmt_vec_info structs of all the stmts in the loop.  */
1189
1190 _loop_vec_info::~_loop_vec_info ()
1191 {
1192   int nbbs;
1193   gimple_stmt_iterator si;
1194   int j;
1195
1196   nbbs = loop->num_nodes;
1197   for (j = 0; j < nbbs; j++)
1198     {
1199       basic_block bb = bbs[j];
1200       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1201         free_stmt_vec_info (gsi_stmt (si));
1202
1203       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1204         {
1205           gimple *stmt = gsi_stmt (si);
1206
1207           /* We may have broken canonical form by moving a constant
1208              into RHS1 of a commutative op.  Fix such occurrences.  */
1209           if (operands_swapped && is_gimple_assign (stmt))
1210             {
1211               enum tree_code code = gimple_assign_rhs_code (stmt);
1212
1213               if ((code == PLUS_EXPR
1214                    || code == POINTER_PLUS_EXPR
1215                    || code == MULT_EXPR)
1216                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1217                 swap_ssa_operands (stmt,
1218                                    gimple_assign_rhs1_ptr (stmt),
1219                                    gimple_assign_rhs2_ptr (stmt));
1220               else if (code == COND_EXPR
1221                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1222                 {
1223                   tree cond_expr = gimple_assign_rhs1 (stmt);
1224                   enum tree_code cond_code = TREE_CODE (cond_expr);
1225
1226                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1227                     {
1228                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1229                                                                   0));
1230                       cond_code = invert_tree_comparison (cond_code,
1231                                                           honor_nans);
1232                       if (cond_code != ERROR_MARK)
1233                         {
1234                           TREE_SET_CODE (cond_expr, cond_code);
1235                           swap_ssa_operands (stmt,
1236                                              gimple_assign_rhs2_ptr (stmt),
1237                                              gimple_assign_rhs3_ptr (stmt));
1238                         }
1239                     }
1240                 }
1241             }
1242
1243           /* Free stmt_vec_info.  */
1244           free_stmt_vec_info (stmt);
1245           gsi_next (&si);
1246         }
1247     }
1248
1249   free (bbs);
1250
1251   release_vec_loop_masks (&masks);
1252
1253   loop->aux = NULL;
1254 }
1255
1256 /* Return true if we can use CMP_TYPE as the comparison type to produce
1257    all masks required to mask LOOP_VINFO.  */
1258
1259 static bool
1260 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1261 {
1262   rgroup_masks *rgm;
1263   unsigned int i;
1264   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1265     if (rgm->mask_type != NULL_TREE
1266         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1267                                             cmp_type, rgm->mask_type,
1268                                             OPTIMIZE_FOR_SPEED))
1269       return false;
1270   return true;
1271 }
1272
1273 /* Calculate the maximum number of scalars per iteration for every
1274    rgroup in LOOP_VINFO.  */
1275
1276 static unsigned int
1277 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1278 {
1279   unsigned int res = 1;
1280   unsigned int i;
1281   rgroup_masks *rgm;
1282   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1283     res = MAX (res, rgm->max_nscalars_per_iter);
1284   return res;
1285 }
1286
1287 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1288    whether we can actually generate the masks required.  Return true if so,
1289    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1290
1291 static bool
1292 vect_verify_full_masking (loop_vec_info loop_vinfo)
1293 {
1294   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1295   unsigned int min_ni_width;
1296
1297   /* Get the maximum number of iterations that is representable
1298      in the counter type.  */
1299   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1300   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1301
1302   /* Get a more refined estimate for the number of iterations.  */
1303   widest_int max_back_edges;
1304   if (max_loop_iterations (loop, &max_back_edges))
1305     max_ni = wi::smin (max_ni, max_back_edges + 1);
1306
1307   /* Account for rgroup masks, in which each bit is replicated N times.  */
1308   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1309
1310   /* Work out how many bits we need to represent the limit.  */
1311   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1312
1313   /* Find a scalar mode for which WHILE_ULT is supported.  */
1314   opt_scalar_int_mode cmp_mode_iter;
1315   tree cmp_type = NULL_TREE;
1316   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1317     {
1318       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1319       if (cmp_bits >= min_ni_width
1320           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1321         {
1322           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1323           if (this_type
1324               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1325             {
1326               /* Although we could stop as soon as we find a valid mode,
1327                  it's often better to continue until we hit Pmode, since the
1328                  operands to the WHILE are more likely to be reusable in
1329                  address calculations.  */
1330               cmp_type = this_type;
1331               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1332                 break;
1333             }
1334         }
1335     }
1336
1337   if (!cmp_type)
1338     return false;
1339
1340   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1341   return true;
1342 }
1343
1344 /* Calculate the cost of one scalar iteration of the loop.  */
1345 static void
1346 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1347 {
1348   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1349   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1350   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1351   int innerloop_iters, i;
1352
1353   /* Count statements in scalar loop.  Using this as scalar cost for a single
1354      iteration for now.
1355
1356      TODO: Add outer loop support.
1357
1358      TODO: Consider assigning different costs to different scalar
1359      statements.  */
1360
1361   /* FORNOW.  */
1362   innerloop_iters = 1;
1363   if (loop->inner)
1364     innerloop_iters = 50; /* FIXME */
1365
1366   for (i = 0; i < nbbs; i++)
1367     {
1368       gimple_stmt_iterator si;
1369       basic_block bb = bbs[i];
1370
1371       if (bb->loop_father == loop->inner)
1372         factor = innerloop_iters;
1373       else
1374         factor = 1;
1375
1376       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1377         {
1378           gimple *stmt = gsi_stmt (si);
1379           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1380
1381           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1382             continue;
1383
1384           /* Skip stmts that are not vectorized inside the loop.  */
1385           if (stmt_info
1386               && !STMT_VINFO_RELEVANT_P (stmt_info)
1387               && (!STMT_VINFO_LIVE_P (stmt_info)
1388                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1389               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1390             continue;
1391
1392           vect_cost_for_stmt kind;
1393           if (STMT_VINFO_DATA_REF (stmt_info))
1394             {
1395               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1396                kind = scalar_load;
1397              else
1398                kind = scalar_store;
1399             }
1400           else
1401             kind = scalar_stmt;
1402
1403           scalar_single_iter_cost
1404             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1405                                  factor, kind, stmt_info, 0, vect_prologue);
1406         }
1407     }
1408   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1409     = scalar_single_iter_cost;
1410 }
1411
1412
1413 /* Function vect_analyze_loop_form_1.
1414
1415    Verify that certain CFG restrictions hold, including:
1416    - the loop has a pre-header
1417    - the loop has a single entry and exit
1418    - the loop exit condition is simple enough
1419    - the number of iterations can be analyzed, i.e, a countable loop.  The
1420      niter could be analyzed under some assumptions.  */
1421
1422 bool
1423 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1424                           tree *assumptions, tree *number_of_iterationsm1,
1425                           tree *number_of_iterations, gcond **inner_loop_cond)
1426 {
1427   if (dump_enabled_p ())
1428     dump_printf_loc (MSG_NOTE, vect_location,
1429                      "=== vect_analyze_loop_form ===\n");
1430
1431   /* Different restrictions apply when we are considering an inner-most loop,
1432      vs. an outer (nested) loop.
1433      (FORNOW. May want to relax some of these restrictions in the future).  */
1434
1435   if (!loop->inner)
1436     {
1437       /* Inner-most loop.  We currently require that the number of BBs is
1438          exactly 2 (the header and latch).  Vectorizable inner-most loops
1439          look like this:
1440
1441                         (pre-header)
1442                            |
1443                           header <--------+
1444                            | |            |
1445                            | +--> latch --+
1446                            |
1447                         (exit-bb)  */
1448
1449       if (loop->num_nodes != 2)
1450         {
1451           if (dump_enabled_p ())
1452             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1453                              "not vectorized: control flow in loop.\n");
1454           return false;
1455         }
1456
1457       if (empty_block_p (loop->header))
1458         {
1459           if (dump_enabled_p ())
1460             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1461                              "not vectorized: empty loop.\n");
1462           return false;
1463         }
1464     }
1465   else
1466     {
1467       struct loop *innerloop = loop->inner;
1468       edge entryedge;
1469
1470       /* Nested loop. We currently require that the loop is doubly-nested,
1471          contains a single inner loop, and the number of BBs is exactly 5.
1472          Vectorizable outer-loops look like this:
1473
1474                         (pre-header)
1475                            |
1476                           header <---+
1477                            |         |
1478                           inner-loop |
1479                            |         |
1480                           tail ------+
1481                            |
1482                         (exit-bb)
1483
1484          The inner-loop has the properties expected of inner-most loops
1485          as described above.  */
1486
1487       if ((loop->inner)->inner || (loop->inner)->next)
1488         {
1489           if (dump_enabled_p ())
1490             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1491                              "not vectorized: multiple nested loops.\n");
1492           return false;
1493         }
1494
1495       if (loop->num_nodes != 5)
1496         {
1497           if (dump_enabled_p ())
1498             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499                              "not vectorized: control flow in loop.\n");
1500           return false;
1501         }
1502
1503       entryedge = loop_preheader_edge (innerloop);
1504       if (entryedge->src != loop->header
1505           || !single_exit (innerloop)
1506           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1507         {
1508           if (dump_enabled_p ())
1509             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1510                              "not vectorized: unsupported outerloop form.\n");
1511           return false;
1512         }
1513
1514       /* Analyze the inner-loop.  */
1515       tree inner_niterm1, inner_niter, inner_assumptions;
1516       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1517                                       &inner_assumptions, &inner_niterm1,
1518                                       &inner_niter, NULL)
1519           /* Don't support analyzing niter under assumptions for inner
1520              loop.  */
1521           || !integer_onep (inner_assumptions))
1522         {
1523           if (dump_enabled_p ())
1524             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1525                              "not vectorized: Bad inner loop.\n");
1526           return false;
1527         }
1528
1529       if (!expr_invariant_in_loop_p (loop, inner_niter))
1530         {
1531           if (dump_enabled_p ())
1532             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533                              "not vectorized: inner-loop count not"
1534                              " invariant.\n");
1535           return false;
1536         }
1537
1538       if (dump_enabled_p ())
1539         dump_printf_loc (MSG_NOTE, vect_location,
1540                          "Considering outer-loop vectorization.\n");
1541     }
1542
1543   if (!single_exit (loop)
1544       || EDGE_COUNT (loop->header->preds) != 2)
1545     {
1546       if (dump_enabled_p ())
1547         {
1548           if (!single_exit (loop))
1549             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1550                              "not vectorized: multiple exits.\n");
1551           else if (EDGE_COUNT (loop->header->preds) != 2)
1552             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1553                              "not vectorized: too many incoming edges.\n");
1554         }
1555       return false;
1556     }
1557
1558   /* We assume that the loop exit condition is at the end of the loop. i.e,
1559      that the loop is represented as a do-while (with a proper if-guard
1560      before the loop if needed), where the loop header contains all the
1561      executable statements, and the latch is empty.  */
1562   if (!empty_block_p (loop->latch)
1563       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1564     {
1565       if (dump_enabled_p ())
1566         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1567                          "not vectorized: latch block not empty.\n");
1568       return false;
1569     }
1570
1571   /* Make sure the exit is not abnormal.  */
1572   edge e = single_exit (loop);
1573   if (e->flags & EDGE_ABNORMAL)
1574     {
1575       if (dump_enabled_p ())
1576         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577                          "not vectorized: abnormal loop exit edge.\n");
1578       return false;
1579     }
1580
1581   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1582                                      number_of_iterationsm1);
1583   if (!*loop_cond)
1584     {
1585       if (dump_enabled_p ())
1586         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1587                          "not vectorized: complicated exit condition.\n");
1588       return false;
1589     }
1590
1591   if (integer_zerop (*assumptions)
1592       || !*number_of_iterations
1593       || chrec_contains_undetermined (*number_of_iterations))
1594     {
1595       if (dump_enabled_p ())
1596         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1597                          "not vectorized: number of iterations cannot be "
1598                          "computed.\n");
1599       return false;
1600     }
1601
1602   if (integer_zerop (*number_of_iterations))
1603     {
1604       if (dump_enabled_p ())
1605         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606                          "not vectorized: number of iterations = 0.\n");
1607       return false;
1608     }
1609
1610   return true;
1611 }
1612
1613 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1614
1615 loop_vec_info
1616 vect_analyze_loop_form (struct loop *loop)
1617 {
1618   tree assumptions, number_of_iterations, number_of_iterationsm1;
1619   gcond *loop_cond, *inner_loop_cond = NULL;
1620
1621   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1622                                   &assumptions, &number_of_iterationsm1,
1623                                   &number_of_iterations, &inner_loop_cond))
1624     return NULL;
1625
1626   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1627   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1628   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1629   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1630   if (!integer_onep (assumptions))
1631     {
1632       /* We consider to vectorize this loop by versioning it under
1633          some assumptions.  In order to do this, we need to clear
1634          existing information computed by scev and niter analyzer.  */
1635       scev_reset_htab ();
1636       free_numbers_of_iterations_estimates (loop);
1637       /* Also set flag for this loop so that following scev and niter
1638          analysis are done under the assumptions.  */
1639       loop_constraint_set (loop, LOOP_C_FINITE);
1640       /* Also record the assumptions for versioning.  */
1641       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1642     }
1643
1644   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1645     {
1646       if (dump_enabled_p ())
1647         {
1648           dump_printf_loc (MSG_NOTE, vect_location,
1649                            "Symbolic number of iterations is ");
1650           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1651           dump_printf (MSG_NOTE, "\n");
1652         }
1653     }
1654
1655   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1656   if (inner_loop_cond)
1657     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1658       = loop_exit_ctrl_vec_info_type;
1659
1660   gcc_assert (!loop->aux);
1661   loop->aux = loop_vinfo;
1662   return loop_vinfo;
1663 }
1664
1665
1666
1667 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1668    statements update the vectorization factor.  */
1669
1670 static void
1671 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1672 {
1673   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1675   int nbbs = loop->num_nodes;
1676   poly_uint64 vectorization_factor;
1677   int i;
1678
1679   if (dump_enabled_p ())
1680     dump_printf_loc (MSG_NOTE, vect_location,
1681                      "=== vect_update_vf_for_slp ===\n");
1682
1683   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1684   gcc_assert (known_ne (vectorization_factor, 0U));
1685
1686   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1687      vectorization factor of the loop is the unrolling factor required by
1688      the SLP instances.  If that unrolling factor is 1, we say, that we
1689      perform pure SLP on loop - cross iteration parallelism is not
1690      exploited.  */
1691   bool only_slp_in_loop = true;
1692   for (i = 0; i < nbbs; i++)
1693     {
1694       basic_block bb = bbs[i];
1695       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1696            gsi_next (&si))
1697         {
1698           gimple *stmt = gsi_stmt (si);
1699           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1700           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1701               && STMT_VINFO_RELATED_STMT (stmt_info))
1702             {
1703               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1704               stmt_info = vinfo_for_stmt (stmt);
1705             }
1706           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1707                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1708               && !PURE_SLP_STMT (stmt_info))
1709             /* STMT needs both SLP and loop-based vectorization.  */
1710             only_slp_in_loop = false;
1711         }
1712     }
1713
1714   if (only_slp_in_loop)
1715     {
1716       dump_printf_loc (MSG_NOTE, vect_location,
1717                        "Loop contains only SLP stmts\n");
1718       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1719     }
1720   else
1721     {
1722       dump_printf_loc (MSG_NOTE, vect_location,
1723                        "Loop contains SLP and non-SLP stmts\n");
1724       /* Both the vectorization factor and unroll factor have the form
1725          current_vector_size * X for some rational X, so they must have
1726          a common multiple.  */
1727       vectorization_factor
1728         = force_common_multiple (vectorization_factor,
1729                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1730     }
1731
1732   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1733   if (dump_enabled_p ())
1734     {
1735       dump_printf_loc (MSG_NOTE, vect_location,
1736                        "Updating vectorization factor to ");
1737       dump_dec (MSG_NOTE, vectorization_factor);
1738       dump_printf (MSG_NOTE, ".\n");
1739     }
1740 }
1741
1742 /* Function vect_analyze_loop_operations.
1743
1744    Scan the loop stmts and make sure they are all vectorizable.  */
1745
1746 static bool
1747 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1748 {
1749   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1750   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1751   int nbbs = loop->num_nodes;
1752   int i;
1753   stmt_vec_info stmt_info;
1754   bool need_to_vectorize = false;
1755   bool ok;
1756
1757   if (dump_enabled_p ())
1758     dump_printf_loc (MSG_NOTE, vect_location,
1759                      "=== vect_analyze_loop_operations ===\n");
1760
1761   for (i = 0; i < nbbs; i++)
1762     {
1763       basic_block bb = bbs[i];
1764
1765       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1766            gsi_next (&si))
1767         {
1768           gphi *phi = si.phi ();
1769           ok = true;
1770
1771           stmt_info = vinfo_for_stmt (phi);
1772           if (dump_enabled_p ())
1773             {
1774               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1775               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1776             }
1777           if (virtual_operand_p (gimple_phi_result (phi)))
1778             continue;
1779
1780           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1781              (i.e., a phi in the tail of the outer-loop).  */
1782           if (! is_loop_header_bb_p (bb))
1783             {
1784               /* FORNOW: we currently don't support the case that these phis
1785                  are not used in the outerloop (unless it is double reduction,
1786                  i.e., this phi is vect_reduction_def), cause this case
1787                  requires to actually do something here.  */
1788               if (STMT_VINFO_LIVE_P (stmt_info)
1789                   && STMT_VINFO_DEF_TYPE (stmt_info)
1790                      != vect_double_reduction_def)
1791                 {
1792                   if (dump_enabled_p ())
1793                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1794                                      "Unsupported loop-closed phi in "
1795                                      "outer-loop.\n");
1796                   return false;
1797                 }
1798
1799               /* If PHI is used in the outer loop, we check that its operand
1800                  is defined in the inner loop.  */
1801               if (STMT_VINFO_RELEVANT_P (stmt_info))
1802                 {
1803                   tree phi_op;
1804                   gimple *op_def_stmt;
1805
1806                   if (gimple_phi_num_args (phi) != 1)
1807                     return false;
1808
1809                   phi_op = PHI_ARG_DEF (phi, 0);
1810                   if (TREE_CODE (phi_op) != SSA_NAME)
1811                     return false;
1812
1813                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1814                   if (gimple_nop_p (op_def_stmt)
1815                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1816                       || !vinfo_for_stmt (op_def_stmt))
1817                     return false;
1818
1819                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1820                         != vect_used_in_outer
1821                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1822                            != vect_used_in_outer_by_reduction)
1823                     return false;
1824                 }
1825
1826               continue;
1827             }
1828
1829           gcc_assert (stmt_info);
1830
1831           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1832                || STMT_VINFO_LIVE_P (stmt_info))
1833               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1834             {
1835               /* A scalar-dependence cycle that we don't support.  */
1836               if (dump_enabled_p ())
1837                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838                                  "not vectorized: scalar dependence cycle.\n");
1839               return false;
1840             }
1841
1842           if (STMT_VINFO_RELEVANT_P (stmt_info))
1843             {
1844               need_to_vectorize = true;
1845               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1846                   && ! PURE_SLP_STMT (stmt_info))
1847                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1848               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1849                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1850                        && ! PURE_SLP_STMT (stmt_info))
1851                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1852             }
1853
1854           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1855             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1856
1857           if (!ok)
1858             {
1859               if (dump_enabled_p ())
1860                 {
1861                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1862                                    "not vectorized: relevant phi not "
1863                                    "supported: ");
1864                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1865                 }
1866               return false;
1867             }
1868         }
1869
1870       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1871            gsi_next (&si))
1872         {
1873           gimple *stmt = gsi_stmt (si);
1874           if (!gimple_clobber_p (stmt)
1875               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1876             return false;
1877         }
1878     } /* bbs */
1879
1880   /* All operations in the loop are either irrelevant (deal with loop
1881      control, or dead), or only used outside the loop and can be moved
1882      out of the loop (e.g. invariants, inductions).  The loop can be
1883      optimized away by scalar optimizations.  We're better off not
1884      touching this loop.  */
1885   if (!need_to_vectorize)
1886     {
1887       if (dump_enabled_p ())
1888         dump_printf_loc (MSG_NOTE, vect_location,
1889                          "All the computation can be taken out of the loop.\n");
1890       if (dump_enabled_p ())
1891         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892                          "not vectorized: redundant loop. no profit to "
1893                          "vectorize.\n");
1894       return false;
1895     }
1896
1897   return true;
1898 }
1899
1900 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1901    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1902    definitely no, or -1 if it's worth retrying.  */
1903
1904 static int
1905 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1906 {
1907   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1908   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1909
1910   /* Only fully-masked loops can have iteration counts less than the
1911      vectorization factor.  */
1912   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1913     {
1914       HOST_WIDE_INT max_niter;
1915
1916       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1917         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1918       else
1919         max_niter = max_stmt_executions_int (loop);
1920
1921       if (max_niter != -1
1922           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1923         {
1924           if (dump_enabled_p ())
1925             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1926                              "not vectorized: iteration count smaller than "
1927                              "vectorization factor.\n");
1928           return 0;
1929         }
1930     }
1931
1932   int min_profitable_iters, min_profitable_estimate;
1933   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1934                                       &min_profitable_estimate);
1935
1936   if (min_profitable_iters < 0)
1937     {
1938       if (dump_enabled_p ())
1939         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1940                          "not vectorized: vectorization not profitable.\n");
1941       if (dump_enabled_p ())
1942         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1943                          "not vectorized: vector version will never be "
1944                          "profitable.\n");
1945       return -1;
1946     }
1947
1948   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1949                                * assumed_vf);
1950
1951   /* Use the cost model only if it is more conservative than user specified
1952      threshold.  */
1953   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1954                                     min_profitable_iters);
1955
1956   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1957
1958   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1959       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1960     {
1961       if (dump_enabled_p ())
1962         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1963                          "not vectorized: vectorization not profitable.\n");
1964       if (dump_enabled_p ())
1965         dump_printf_loc (MSG_NOTE, vect_location,
1966                          "not vectorized: iteration count smaller than user "
1967                          "specified loop bound parameter or minimum profitable "
1968                          "iterations (whichever is more conservative).\n");
1969       return 0;
1970     }
1971
1972   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1973   if (estimated_niter == -1)
1974     estimated_niter = likely_max_stmt_executions_int (loop);
1975   if (estimated_niter != -1
1976       && ((unsigned HOST_WIDE_INT) estimated_niter
1977           < MAX (th, (unsigned) min_profitable_estimate)))
1978     {
1979       if (dump_enabled_p ())
1980         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1981                          "not vectorized: estimated iteration count too "
1982                          "small.\n");
1983       if (dump_enabled_p ())
1984         dump_printf_loc (MSG_NOTE, vect_location,
1985                          "not vectorized: estimated iteration count smaller "
1986                          "than specified loop bound parameter or minimum "
1987                          "profitable iterations (whichever is more "
1988                          "conservative).\n");
1989       return -1;
1990     }
1991
1992   return 1;
1993 }
1994
1995
1996 /* Function vect_analyze_loop_2.
1997
1998    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1999    for it.  The different analyses will record information in the
2000    loop_vec_info struct.  */
2001 static bool
2002 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2003 {
2004   bool ok;
2005   int res;
2006   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2007   poly_uint64 min_vf = 2;
2008   unsigned int n_stmts = 0;
2009
2010   /* The first group of checks is independent of the vector size.  */
2011   fatal = true;
2012
2013   /* Find all data references in the loop (which correspond to vdefs/vuses)
2014      and analyze their evolution in the loop.  */
2015
2016   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2017
2018   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2019   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2020     {
2021       if (dump_enabled_p ())
2022         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2023                          "not vectorized: loop nest containing two "
2024                          "or more consecutive inner loops cannot be "
2025                          "vectorized\n");
2026       return false;
2027     }
2028
2029   for (unsigned i = 0; i < loop->num_nodes; i++)
2030     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2031          !gsi_end_p (gsi); gsi_next (&gsi))
2032       {
2033         gimple *stmt = gsi_stmt (gsi);
2034         if (is_gimple_debug (stmt))
2035           continue;
2036         ++n_stmts;
2037         if (!find_data_references_in_stmt (loop, stmt,
2038                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
2039           {
2040             if (is_gimple_call (stmt) && loop->safelen)
2041               {
2042                 tree fndecl = gimple_call_fndecl (stmt), op;
2043                 if (fndecl != NULL_TREE)
2044                   {
2045                     cgraph_node *node = cgraph_node::get (fndecl);
2046                     if (node != NULL && node->simd_clones != NULL)
2047                       {
2048                         unsigned int j, n = gimple_call_num_args (stmt);
2049                         for (j = 0; j < n; j++)
2050                           {
2051                             op = gimple_call_arg (stmt, j);
2052                             if (DECL_P (op)
2053                                 || (REFERENCE_CLASS_P (op)
2054                                     && get_base_address (op)))
2055                               break;
2056                           }
2057                         op = gimple_call_lhs (stmt);
2058                         /* Ignore #pragma omp declare simd functions
2059                            if they don't have data references in the
2060                            call stmt itself.  */
2061                         if (j == n
2062                             && !(op
2063                                  && (DECL_P (op)
2064                                      || (REFERENCE_CLASS_P (op)
2065                                          && get_base_address (op)))))
2066                           continue;
2067                       }
2068                   }
2069               }
2070             if (dump_enabled_p ())
2071               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2072                                "not vectorized: loop contains function "
2073                                "calls or data references that cannot "
2074                                "be analyzed\n");
2075             return false;
2076           }
2077       }
2078
2079   /* Analyze the data references and also adjust the minimal
2080      vectorization factor according to the loads and stores.  */
2081
2082   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2083   if (!ok)
2084     {
2085       if (dump_enabled_p ())
2086         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2087                          "bad data references.\n");
2088       return false;
2089     }
2090
2091   /* Classify all cross-iteration scalar data-flow cycles.
2092      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2093   vect_analyze_scalar_cycles (loop_vinfo);
2094
2095   vect_pattern_recog (loop_vinfo);
2096
2097   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2098
2099   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2100      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2101
2102   ok = vect_analyze_data_ref_accesses (loop_vinfo);
2103   if (!ok)
2104     {
2105       if (dump_enabled_p ())
2106         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2107                          "bad data access.\n");
2108       return false;
2109     }
2110
2111   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2112
2113   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2114   if (!ok)
2115     {
2116       if (dump_enabled_p ())
2117         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2118                          "unexpected pattern.\n");
2119       return false;
2120     }
2121
2122   /* While the rest of the analysis below depends on it in some way.  */
2123   fatal = false;
2124
2125   /* Analyze data dependences between the data-refs in the loop
2126      and adjust the maximum vectorization factor according to
2127      the dependences.
2128      FORNOW: fail at the first data dependence that we encounter.  */
2129
2130   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2131   if (!ok
2132       || (max_vf != MAX_VECTORIZATION_FACTOR
2133           && maybe_lt (max_vf, min_vf)))
2134     {
2135       if (dump_enabled_p ())
2136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2137                              "bad data dependence.\n");
2138       return false;
2139     }
2140   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2141
2142   ok = vect_determine_vectorization_factor (loop_vinfo);
2143   if (!ok)
2144     {
2145       if (dump_enabled_p ())
2146         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147                          "can't determine vectorization factor.\n");
2148       return false;
2149     }
2150   if (max_vf != MAX_VECTORIZATION_FACTOR
2151       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2152     {
2153       if (dump_enabled_p ())
2154         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2155                          "bad data dependence.\n");
2156       return false;
2157     }
2158
2159   /* Compute the scalar iteration cost.  */
2160   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2161
2162   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2163   unsigned th;
2164
2165   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2166   ok = vect_analyze_slp (loop_vinfo, n_stmts);
2167   if (!ok)
2168     return false;
2169
2170   /* If there are any SLP instances mark them as pure_slp.  */
2171   bool slp = vect_make_slp_decision (loop_vinfo);
2172   if (slp)
2173     {
2174       /* Find stmts that need to be both vectorized and SLPed.  */
2175       vect_detect_hybrid_slp (loop_vinfo);
2176
2177       /* Update the vectorization factor based on the SLP decision.  */
2178       vect_update_vf_for_slp (loop_vinfo);
2179     }
2180
2181   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2182
2183   /* We don't expect to have to roll back to anything other than an empty
2184      set of rgroups.  */
2185   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2186
2187   /* This is the point where we can re-start analysis with SLP forced off.  */
2188 start_over:
2189
2190   /* Now the vectorization factor is final.  */
2191   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2192   gcc_assert (known_ne (vectorization_factor, 0U));
2193
2194   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2195     {
2196       dump_printf_loc (MSG_NOTE, vect_location,
2197                        "vectorization_factor = ");
2198       dump_dec (MSG_NOTE, vectorization_factor);
2199       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2200                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2201     }
2202
2203   HOST_WIDE_INT max_niter
2204     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2205
2206   /* Analyze the alignment of the data-refs in the loop.
2207      Fail if a data reference is found that cannot be vectorized.  */
2208
2209   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2210   if (!ok)
2211     {
2212       if (dump_enabled_p ())
2213         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214                          "bad data alignment.\n");
2215       return false;
2216     }
2217
2218   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2219      It is important to call pruning after vect_analyze_data_ref_accesses,
2220      since we use grouping information gathered by interleaving analysis.  */
2221   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2222   if (!ok)
2223     return false;
2224
2225   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2226      vectorization.  */
2227   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2228     {
2229     /* This pass will decide on using loop versioning and/or loop peeling in
2230        order to enhance the alignment of data references in the loop.  */
2231     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2232     if (!ok)
2233       {
2234         if (dump_enabled_p ())
2235           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236                            "bad data alignment.\n");
2237         return false;
2238       }
2239     }
2240
2241   if (slp)
2242     {
2243       /* Analyze operations in the SLP instances.  Note this may
2244          remove unsupported SLP instances which makes the above
2245          SLP kind detection invalid.  */
2246       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2247       vect_slp_analyze_operations (loop_vinfo);
2248       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2249         goto again;
2250     }
2251
2252   /* Scan all the remaining operations in the loop that are not subject
2253      to SLP and make sure they are vectorizable.  */
2254   ok = vect_analyze_loop_operations (loop_vinfo);
2255   if (!ok)
2256     {
2257       if (dump_enabled_p ())
2258         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259                          "bad operation or unsupported loop bound.\n");
2260       return false;
2261     }
2262
2263   if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2264       && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2265     {
2266       LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
2267       if (dump_enabled_p ())
2268         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269                          "can't use a fully-masked loop because peeling for"
2270                          " gaps is required.\n");
2271     }
2272
2273   /* Decide whether to use a fully-masked loop for this vectorization
2274      factor.  */
2275   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2276     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2277        && vect_verify_full_masking (loop_vinfo));
2278   if (dump_enabled_p ())
2279     {
2280       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2281         dump_printf_loc (MSG_NOTE, vect_location,
2282                          "using a fully-masked loop.\n");
2283       else
2284         dump_printf_loc (MSG_NOTE, vect_location,
2285                          "not using a fully-masked loop.\n");
2286     }
2287
2288   /* If epilog loop is required because of data accesses with gaps,
2289      one additional iteration needs to be peeled.  Check if there is
2290      enough iterations for vectorization.  */
2291   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2292       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2293       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2294     {
2295       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2296       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2297
2298       if (known_lt (wi::to_widest (scalar_niters), vf))
2299         {
2300           if (dump_enabled_p ())
2301             dump_printf_loc (MSG_NOTE, vect_location,
2302                              "loop has no enough iterations to support"
2303                              " peeling for gaps.\n");
2304           return false;
2305         }
2306     }
2307
2308   /* Check the costings of the loop make vectorizing worthwhile.  */
2309   res = vect_analyze_loop_costing (loop_vinfo);
2310   if (res < 0)
2311     goto again;
2312   if (!res)
2313     {
2314       if (dump_enabled_p ())
2315         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316                          "Loop costings not worthwhile.\n");
2317       return false;
2318     }
2319
2320   /* Decide whether we need to create an epilogue loop to handle
2321      remaining scalar iterations.  */
2322   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2323
2324   unsigned HOST_WIDE_INT const_vf;
2325   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2326     /* The main loop handles all iterations.  */
2327     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2328   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2329            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2330     {
2331       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2332                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2333                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2334         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2335     }
2336   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2337            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2338            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2339                 < (unsigned) exact_log2 (const_vf))
2340                /* In case of versioning, check if the maximum number of
2341                   iterations is greater than th.  If they are identical,
2342                   the epilogue is unnecessary.  */
2343                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2344                    || ((unsigned HOST_WIDE_INT) max_niter
2345                        > (th / const_vf) * const_vf))))
2346     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2347
2348   /* If an epilogue loop is required make sure we can create one.  */
2349   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2350       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2351     {
2352       if (dump_enabled_p ())
2353         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2354       if (!vect_can_advance_ivs_p (loop_vinfo)
2355           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2356                                            single_exit (LOOP_VINFO_LOOP
2357                                                          (loop_vinfo))))
2358         {
2359           if (dump_enabled_p ())
2360             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361                              "not vectorized: can't create required "
2362                              "epilog loop\n");
2363           goto again;
2364         }
2365     }
2366
2367   /* During peeling, we need to check if number of loop iterations is
2368      enough for both peeled prolog loop and vector loop.  This check
2369      can be merged along with threshold check of loop versioning, so
2370      increase threshold for this case if necessary.  */
2371   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2372     {
2373       poly_uint64 niters_th = 0;
2374
2375       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2376         {
2377           /* Niters for peeled prolog loop.  */
2378           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2379             {
2380               struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2381               tree vectype
2382                 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2383               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2384             }
2385           else
2386             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2387         }
2388
2389       /* Niters for at least one iteration of vectorized loop.  */
2390       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2391         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2392       /* One additional iteration because of peeling for gap.  */
2393       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2394         niters_th += 1;
2395       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2396     }
2397
2398   gcc_assert (known_eq (vectorization_factor,
2399                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2400
2401   /* Ok to vectorize!  */
2402   return true;
2403
2404 again:
2405   /* Try again with SLP forced off but if we didn't do any SLP there is
2406      no point in re-trying.  */
2407   if (!slp)
2408     return false;
2409
2410   /* If there are reduction chains re-trying will fail anyway.  */
2411   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2412     return false;
2413
2414   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2415      via interleaving or lane instructions.  */
2416   slp_instance instance;
2417   slp_tree node;
2418   unsigned i, j;
2419   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2420     {
2421       stmt_vec_info vinfo;
2422       vinfo = vinfo_for_stmt
2423           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2424       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2425         continue;
2426       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2427       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2428       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2429       if (! vect_store_lanes_supported (vectype, size, false)
2430           && ! vect_grouped_store_supported (vectype, size))
2431         return false;
2432       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2433         {
2434           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2435           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2436           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2437           size = STMT_VINFO_GROUP_SIZE (vinfo);
2438           vectype = STMT_VINFO_VECTYPE (vinfo);
2439           if (! vect_load_lanes_supported (vectype, size, false)
2440               && ! vect_grouped_load_supported (vectype, single_element_p,
2441                                                 size))
2442             return false;
2443         }
2444     }
2445
2446   if (dump_enabled_p ())
2447     dump_printf_loc (MSG_NOTE, vect_location,
2448                      "re-trying with SLP disabled\n");
2449
2450   /* Roll back state appropriately.  No SLP this time.  */
2451   slp = false;
2452   /* Restore vectorization factor as it were without SLP.  */
2453   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2454   /* Free the SLP instances.  */
2455   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2456     vect_free_slp_instance (instance);
2457   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2458   /* Reset SLP type to loop_vect on all stmts.  */
2459   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2460     {
2461       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2462       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2463            !gsi_end_p (si); gsi_next (&si))
2464         {
2465           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2466           STMT_SLP_TYPE (stmt_info) = loop_vect;
2467         }
2468       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2469            !gsi_end_p (si); gsi_next (&si))
2470         {
2471           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2472           STMT_SLP_TYPE (stmt_info) = loop_vect;
2473           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2474             {
2475               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2476               STMT_SLP_TYPE (stmt_info) = loop_vect;
2477               for (gimple_stmt_iterator pi
2478                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2479                    !gsi_end_p (pi); gsi_next (&pi))
2480                 {
2481                   gimple *pstmt = gsi_stmt (pi);
2482                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2483                 }
2484             }
2485         }
2486     }
2487   /* Free optimized alias test DDRS.  */
2488   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2489   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2490   /* Reset target cost data.  */
2491   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2492   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2493     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2494   /* Reset accumulated rgroup information.  */
2495   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2496   /* Reset assorted flags.  */
2497   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2498   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2499   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2500   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2501   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2502
2503   goto start_over;
2504 }
2505
2506 /* Function vect_analyze_loop.
2507
2508    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2509    for it.  The different analyses will record information in the
2510    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2511    be vectorized.  */
2512 loop_vec_info
2513 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2514 {
2515   loop_vec_info loop_vinfo;
2516   auto_vector_sizes vector_sizes;
2517
2518   /* Autodetect first vector size we try.  */
2519   current_vector_size = 0;
2520   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2521   unsigned int next_size = 0;
2522
2523   if (dump_enabled_p ())
2524     dump_printf_loc (MSG_NOTE, vect_location,
2525                      "===== analyze_loop_nest =====\n");
2526
2527   if (loop_outer (loop)
2528       && loop_vec_info_for_loop (loop_outer (loop))
2529       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2530     {
2531       if (dump_enabled_p ())
2532         dump_printf_loc (MSG_NOTE, vect_location,
2533                          "outer-loop already vectorized.\n");
2534       return NULL;
2535     }
2536
2537   poly_uint64 autodetected_vector_size = 0;
2538   while (1)
2539     {
2540       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2541       loop_vinfo = vect_analyze_loop_form (loop);
2542       if (!loop_vinfo)
2543         {
2544           if (dump_enabled_p ())
2545             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2546                              "bad loop form.\n");
2547           return NULL;
2548         }
2549
2550       bool fatal = false;
2551
2552       if (orig_loop_vinfo)
2553         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2554
2555       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2556         {
2557           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2558
2559           return loop_vinfo;
2560         }
2561
2562       delete loop_vinfo;
2563
2564       if (next_size == 0)
2565         autodetected_vector_size = current_vector_size;
2566
2567       if (next_size < vector_sizes.length ()
2568           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2569         next_size += 1;
2570
2571       if (fatal
2572           || next_size == vector_sizes.length ()
2573           || known_eq (current_vector_size, 0U))
2574         return NULL;
2575
2576       /* Try the next biggest vector size.  */
2577       current_vector_size = vector_sizes[next_size++];
2578       if (dump_enabled_p ())
2579         {
2580           dump_printf_loc (MSG_NOTE, vect_location,
2581                            "***** Re-trying analysis with "
2582                            "vector size ");
2583           dump_dec (MSG_NOTE, current_vector_size);
2584           dump_printf (MSG_NOTE, "\n");
2585         }
2586     }
2587 }
2588
2589
2590 /* Function reduction_fn_for_scalar_code
2591
2592    Input:
2593    CODE - tree_code of a reduction operations.
2594
2595    Output:
2596    REDUC_FN - the corresponding internal function to be used to reduce the
2597       vector of partial results into a single scalar result, or IFN_LAST
2598       if the operation is a supported reduction operation, but does not have
2599       such an internal function.
2600
2601    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2602
2603 static bool
2604 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2605 {
2606   switch (code)
2607     {
2608       case MAX_EXPR:
2609         *reduc_fn = IFN_REDUC_MAX;
2610         return true;
2611
2612       case MIN_EXPR:
2613         *reduc_fn = IFN_REDUC_MIN;
2614         return true;
2615
2616       case PLUS_EXPR:
2617         *reduc_fn = IFN_REDUC_PLUS;
2618         return true;
2619
2620       case BIT_AND_EXPR:
2621         *reduc_fn = IFN_REDUC_AND;
2622         return true;
2623
2624       case BIT_IOR_EXPR:
2625         *reduc_fn = IFN_REDUC_IOR;
2626         return true;
2627
2628       case BIT_XOR_EXPR:
2629         *reduc_fn = IFN_REDUC_XOR;
2630         return true;
2631
2632       case MULT_EXPR:
2633       case MINUS_EXPR:
2634         *reduc_fn = IFN_LAST;
2635         return true;
2636
2637       default:
2638        return false;
2639     }
2640 }
2641
2642 /* If there is a neutral value X such that SLP reduction NODE would not
2643    be affected by the introduction of additional X elements, return that X,
2644    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2645    is true if the SLP statements perform a single reduction, false if each
2646    statement performs an independent reduction.  */
2647
2648 static tree
2649 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2650                               bool reduc_chain)
2651 {
2652   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2653   gimple *stmt = stmts[0];
2654   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2655   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2656   tree scalar_type = TREE_TYPE (vector_type);
2657   struct loop *loop = gimple_bb (stmt)->loop_father;
2658   gcc_assert (loop);
2659
2660   switch (code)
2661     {
2662     case WIDEN_SUM_EXPR:
2663     case DOT_PROD_EXPR:
2664     case SAD_EXPR:
2665     case PLUS_EXPR:
2666     case MINUS_EXPR:
2667     case BIT_IOR_EXPR:
2668     case BIT_XOR_EXPR:
2669       return build_zero_cst (scalar_type);
2670
2671     case MULT_EXPR:
2672       return build_one_cst (scalar_type);
2673
2674     case BIT_AND_EXPR:
2675       return build_all_ones_cst (scalar_type);
2676
2677     case MAX_EXPR:
2678     case MIN_EXPR:
2679       /* For MIN/MAX the initial values are neutral.  A reduction chain
2680          has only a single initial value, so that value is neutral for
2681          all statements.  */
2682       if (reduc_chain)
2683         return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2684       return NULL_TREE;
2685
2686     default:
2687       return NULL_TREE;
2688     }
2689 }
2690
2691 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2692    STMT is printed with a message MSG. */
2693
2694 static void
2695 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2696 {
2697   dump_printf_loc (msg_type, vect_location, "%s", msg);
2698   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2699 }
2700
2701
2702 /* Detect SLP reduction of the form:
2703
2704    #a1 = phi <a5, a0>
2705    a2 = operation (a1)
2706    a3 = operation (a2)
2707    a4 = operation (a3)
2708    a5 = operation (a4)
2709
2710    #a = phi <a5>
2711
2712    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2713    FIRST_STMT is the first reduction stmt in the chain
2714    (a2 = operation (a1)).
2715
2716    Return TRUE if a reduction chain was detected.  */
2717
2718 static bool
2719 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2720                        gimple *first_stmt)
2721 {
2722   struct loop *loop = (gimple_bb (phi))->loop_father;
2723   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2724   enum tree_code code;
2725   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2726   stmt_vec_info use_stmt_info, current_stmt_info;
2727   tree lhs;
2728   imm_use_iterator imm_iter;
2729   use_operand_p use_p;
2730   int nloop_uses, size = 0, n_out_of_loop_uses;
2731   bool found = false;
2732
2733   if (loop != vect_loop)
2734     return false;
2735
2736   lhs = PHI_RESULT (phi);
2737   code = gimple_assign_rhs_code (first_stmt);
2738   while (1)
2739     {
2740       nloop_uses = 0;
2741       n_out_of_loop_uses = 0;
2742       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2743         {
2744           gimple *use_stmt = USE_STMT (use_p);
2745           if (is_gimple_debug (use_stmt))
2746             continue;
2747
2748           /* Check if we got back to the reduction phi.  */
2749           if (use_stmt == phi)
2750             {
2751               loop_use_stmt = use_stmt;
2752               found = true;
2753               break;
2754             }
2755
2756           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2757             {
2758               loop_use_stmt = use_stmt;
2759               nloop_uses++;
2760             }
2761            else
2762              n_out_of_loop_uses++;
2763
2764            /* There are can be either a single use in the loop or two uses in
2765               phi nodes.  */
2766            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2767              return false;
2768         }
2769
2770       if (found)
2771         break;
2772
2773       /* We reached a statement with no loop uses.  */
2774       if (nloop_uses == 0)
2775         return false;
2776
2777       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2778       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2779         return false;
2780
2781       if (!is_gimple_assign (loop_use_stmt)
2782           || code != gimple_assign_rhs_code (loop_use_stmt)
2783           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2784         return false;
2785
2786       /* Insert USE_STMT into reduction chain.  */
2787       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2788       if (current_stmt)
2789         {
2790           current_stmt_info = vinfo_for_stmt (current_stmt);
2791           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2792           GROUP_FIRST_ELEMENT (use_stmt_info)
2793             = GROUP_FIRST_ELEMENT (current_stmt_info);
2794         }
2795       else
2796         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2797
2798       lhs = gimple_assign_lhs (loop_use_stmt);
2799       current_stmt = loop_use_stmt;
2800       size++;
2801    }
2802
2803   if (!found || loop_use_stmt != phi || size < 2)
2804     return false;
2805
2806   /* Swap the operands, if needed, to make the reduction operand be the second
2807      operand.  */
2808   lhs = PHI_RESULT (phi);
2809   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2810   while (next_stmt)
2811     {
2812       if (gimple_assign_rhs2 (next_stmt) == lhs)
2813         {
2814           tree op = gimple_assign_rhs1 (next_stmt);
2815           gimple *def_stmt = NULL;
2816
2817           if (TREE_CODE (op) == SSA_NAME)
2818             def_stmt = SSA_NAME_DEF_STMT (op);
2819
2820           /* Check that the other def is either defined in the loop
2821              ("vect_internal_def"), or it's an induction (defined by a
2822              loop-header phi-node).  */
2823           if (def_stmt
2824               && gimple_bb (def_stmt)
2825               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2826               && (is_gimple_assign (def_stmt)
2827                   || is_gimple_call (def_stmt)
2828                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2829                            == vect_induction_def
2830                   || (gimple_code (def_stmt) == GIMPLE_PHI
2831                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2832                                   == vect_internal_def
2833                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2834             {
2835               lhs = gimple_assign_lhs (next_stmt);
2836               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2837               continue;
2838             }
2839
2840           return false;
2841         }
2842       else
2843         {
2844           tree op = gimple_assign_rhs2 (next_stmt);
2845           gimple *def_stmt = NULL;
2846
2847           if (TREE_CODE (op) == SSA_NAME)
2848             def_stmt = SSA_NAME_DEF_STMT (op);
2849
2850           /* Check that the other def is either defined in the loop
2851             ("vect_internal_def"), or it's an induction (defined by a
2852             loop-header phi-node).  */
2853           if (def_stmt
2854               && gimple_bb (def_stmt)
2855               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2856               && (is_gimple_assign (def_stmt)
2857                   || is_gimple_call (def_stmt)
2858                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2859                               == vect_induction_def
2860                   || (gimple_code (def_stmt) == GIMPLE_PHI
2861                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2862                                   == vect_internal_def
2863                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2864             {
2865               if (dump_enabled_p ())
2866                 {
2867                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2868                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2869                 }
2870
2871               swap_ssa_operands (next_stmt,
2872                                  gimple_assign_rhs1_ptr (next_stmt),
2873                                  gimple_assign_rhs2_ptr (next_stmt));
2874               update_stmt (next_stmt);
2875
2876               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2877                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2878             }
2879           else
2880             return false;
2881         }
2882
2883       lhs = gimple_assign_lhs (next_stmt);
2884       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2885     }
2886
2887   /* Save the chain for further analysis in SLP detection.  */
2888   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2889   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2890   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2891
2892   return true;
2893 }
2894
2895
2896 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2897    reduction operation CODE has a handled computation expression.  */
2898
2899 bool
2900 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2901                       enum tree_code code)
2902 {
2903   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2904   auto_bitmap visited;
2905   tree lookfor = PHI_RESULT (phi);
2906   ssa_op_iter curri;
2907   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2908   while (USE_FROM_PTR (curr) != loop_arg)
2909     curr = op_iter_next_use (&curri);
2910   curri.i = curri.numops;
2911   do
2912     {
2913       path.safe_push (std::make_pair (curri, curr));
2914       tree use = USE_FROM_PTR (curr);
2915       if (use == lookfor)
2916         break;
2917       gimple *def = SSA_NAME_DEF_STMT (use);
2918       if (gimple_nop_p (def)
2919           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2920         {
2921 pop:
2922           do
2923             {
2924               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2925               curri = x.first;
2926               curr = x.second;
2927               do
2928                 curr = op_iter_next_use (&curri);
2929               /* Skip already visited or non-SSA operands (from iterating
2930                  over PHI args).  */
2931               while (curr != NULL_USE_OPERAND_P
2932                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2933                          || ! bitmap_set_bit (visited,
2934                                               SSA_NAME_VERSION
2935                                                 (USE_FROM_PTR (curr)))));
2936             }
2937           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2938           if (curr == NULL_USE_OPERAND_P)
2939             break;
2940         }
2941       else
2942         {
2943           if (gimple_code (def) == GIMPLE_PHI)
2944             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2945           else
2946             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2947           while (curr != NULL_USE_OPERAND_P
2948                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2949                      || ! bitmap_set_bit (visited,
2950                                           SSA_NAME_VERSION
2951                                             (USE_FROM_PTR (curr)))))
2952             curr = op_iter_next_use (&curri);
2953           if (curr == NULL_USE_OPERAND_P)
2954             goto pop;
2955         }
2956     }
2957   while (1);
2958   if (dump_file && (dump_flags & TDF_DETAILS))
2959     {
2960       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2961       unsigned i;
2962       std::pair<ssa_op_iter, use_operand_p> *x;
2963       FOR_EACH_VEC_ELT (path, i, x)
2964         {
2965           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2966           dump_printf (MSG_NOTE, " ");
2967         }
2968       dump_printf (MSG_NOTE, "\n");
2969     }
2970
2971   /* Check whether the reduction path detected is valid.  */
2972   bool fail = path.length () == 0;
2973   bool neg = false;
2974   for (unsigned i = 1; i < path.length (); ++i)
2975     {
2976       gimple *use_stmt = USE_STMT (path[i].second);
2977       tree op = USE_FROM_PTR (path[i].second);
2978       if (! has_single_use (op)
2979           || ! is_gimple_assign (use_stmt))
2980         {
2981           fail = true;
2982           break;
2983         }
2984       if (gimple_assign_rhs_code (use_stmt) != code)
2985         {
2986           if (code == PLUS_EXPR
2987               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2988             {
2989               /* Track whether we negate the reduction value each iteration.  */
2990               if (gimple_assign_rhs2 (use_stmt) == op)
2991                 neg = ! neg;
2992             }
2993           else
2994             {
2995               fail = true;
2996               break;
2997             }
2998         }
2999     }
3000   return ! fail && ! neg;
3001 }
3002
3003
3004 /* Function vect_is_simple_reduction
3005
3006    (1) Detect a cross-iteration def-use cycle that represents a simple
3007    reduction computation.  We look for the following pattern:
3008
3009    loop_header:
3010      a1 = phi < a0, a2 >
3011      a3 = ...
3012      a2 = operation (a3, a1)
3013
3014    or
3015
3016    a3 = ...
3017    loop_header:
3018      a1 = phi < a0, a2 >
3019      a2 = operation (a3, a1)
3020
3021    such that:
3022    1. operation is commutative and associative and it is safe to
3023       change the order of the computation
3024    2. no uses for a2 in the loop (a2 is used out of the loop)
3025    3. no uses of a1 in the loop besides the reduction operation
3026    4. no uses of a1 outside the loop.
3027
3028    Conditions 1,4 are tested here.
3029    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3030
3031    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3032    nested cycles.
3033
3034    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3035    reductions:
3036
3037      a1 = phi < a0, a2 >
3038      inner loop (def of a3)
3039      a2 = phi < a3 >
3040
3041    (4) Detect condition expressions, ie:
3042      for (int i = 0; i < N; i++)
3043        if (a[i] < val)
3044         ret_val = a[i];
3045
3046 */
3047
3048 static gimple *
3049 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3050                           bool *double_reduc,
3051                           bool need_wrapping_integral_overflow,
3052                           enum vect_reduction_type *v_reduc_type)
3053 {
3054   struct loop *loop = (gimple_bb (phi))->loop_father;
3055   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3056   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3057   enum tree_code orig_code, code;
3058   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3059   tree type;
3060   int nloop_uses;
3061   tree name;
3062   imm_use_iterator imm_iter;
3063   use_operand_p use_p;
3064   bool phi_def;
3065
3066   *double_reduc = false;
3067   *v_reduc_type = TREE_CODE_REDUCTION;
3068
3069   tree phi_name = PHI_RESULT (phi);
3070   /* ???  If there are no uses of the PHI result the inner loop reduction
3071      won't be detected as possibly double-reduction by vectorizable_reduction
3072      because that tries to walk the PHI arg from the preheader edge which
3073      can be constant.  See PR60382.  */
3074   if (has_zero_uses (phi_name))
3075     return NULL;
3076   nloop_uses = 0;
3077   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3078     {
3079       gimple *use_stmt = USE_STMT (use_p);
3080       if (is_gimple_debug (use_stmt))
3081         continue;
3082
3083       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3084         {
3085           if (dump_enabled_p ())
3086             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3087                              "intermediate value used outside loop.\n");
3088
3089           return NULL;
3090         }
3091
3092       nloop_uses++;
3093       if (nloop_uses > 1)
3094         {
3095           if (dump_enabled_p ())
3096             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3097                              "reduction value used in loop.\n");
3098           return NULL;
3099         }
3100
3101       phi_use_stmt = use_stmt;
3102     }
3103
3104   edge latch_e = loop_latch_edge (loop);
3105   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3106   if (TREE_CODE (loop_arg) != SSA_NAME)
3107     {
3108       if (dump_enabled_p ())
3109         {
3110           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3111                            "reduction: not ssa_name: ");
3112           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3113           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3114         }
3115       return NULL;
3116     }
3117
3118   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3119   if (is_gimple_assign (def_stmt))
3120     {
3121       name = gimple_assign_lhs (def_stmt);
3122       phi_def = false;
3123     }
3124   else if (gimple_code (def_stmt) == GIMPLE_PHI)
3125     {
3126       name = PHI_RESULT (def_stmt);
3127       phi_def = true;
3128     }
3129   else
3130     {
3131       if (dump_enabled_p ())
3132         {
3133           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3134                            "reduction: unhandled reduction operation: ");
3135           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3136         }
3137       return NULL;
3138     }
3139
3140   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3141     return NULL;
3142
3143   nloop_uses = 0;
3144   auto_vec<gphi *, 3> lcphis;
3145   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3146     {
3147       gimple *use_stmt = USE_STMT (use_p);
3148       if (is_gimple_debug (use_stmt))
3149         continue;
3150       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3151         nloop_uses++;
3152       else
3153         /* We can have more than one loop-closed PHI.  */
3154         lcphis.safe_push (as_a <gphi *> (use_stmt));
3155       if (nloop_uses > 1)
3156         {
3157           if (dump_enabled_p ())
3158             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3159                              "reduction used in loop.\n");
3160           return NULL;
3161         }
3162     }
3163
3164   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3165      defined in the inner loop.  */
3166   if (phi_def)
3167     {
3168       op1 = PHI_ARG_DEF (def_stmt, 0);
3169
3170       if (gimple_phi_num_args (def_stmt) != 1
3171           || TREE_CODE (op1) != SSA_NAME)
3172         {
3173           if (dump_enabled_p ())
3174             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3175                              "unsupported phi node definition.\n");
3176
3177           return NULL;
3178         }
3179
3180       def1 = SSA_NAME_DEF_STMT (op1);
3181       if (gimple_bb (def1)
3182           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3183           && loop->inner
3184           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3185           && is_gimple_assign (def1)
3186           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3187         {
3188           if (dump_enabled_p ())
3189             report_vect_op (MSG_NOTE, def_stmt,
3190                             "detected double reduction: ");
3191
3192           *double_reduc = true;
3193           return def_stmt;
3194         }
3195
3196       return NULL;
3197     }
3198
3199   /* If we are vectorizing an inner reduction we are executing that
3200      in the original order only in case we are not dealing with a
3201      double reduction.  */
3202   bool check_reduction = true;
3203   if (flow_loop_nested_p (vect_loop, loop))
3204     {
3205       gphi *lcphi;
3206       unsigned i;
3207       check_reduction = false;
3208       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3209         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3210           {
3211             gimple *use_stmt = USE_STMT (use_p);
3212             if (is_gimple_debug (use_stmt))
3213               continue;
3214             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3215               check_reduction = true;
3216           }
3217     }
3218
3219   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3220   code = orig_code = gimple_assign_rhs_code (def_stmt);
3221
3222   /* We can handle "res -= x[i]", which is non-associative by
3223      simply rewriting this into "res += -x[i]".  Avoid changing
3224      gimple instruction for the first simple tests and only do this
3225      if we're allowed to change code at all.  */
3226   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3227     code = PLUS_EXPR;
3228
3229   if (code == COND_EXPR)
3230     {
3231       if (! nested_in_vect_loop)
3232         *v_reduc_type = COND_REDUCTION;
3233
3234       op3 = gimple_assign_rhs1 (def_stmt);
3235       if (COMPARISON_CLASS_P (op3))
3236         {
3237           op4 = TREE_OPERAND (op3, 1);
3238           op3 = TREE_OPERAND (op3, 0);
3239         }
3240       if (op3 == phi_name || op4 == phi_name)
3241         {
3242           if (dump_enabled_p ())
3243             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3244                             "reduction: condition depends on previous"
3245                             " iteration: ");
3246           return NULL;
3247         }
3248
3249       op1 = gimple_assign_rhs2 (def_stmt);
3250       op2 = gimple_assign_rhs3 (def_stmt);
3251     }
3252   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3253     {
3254       if (dump_enabled_p ())
3255         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3256                         "reduction: not commutative/associative: ");
3257       return NULL;
3258     }
3259   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3260     {
3261       op1 = gimple_assign_rhs1 (def_stmt);
3262       op2 = gimple_assign_rhs2 (def_stmt);
3263     }
3264   else
3265     {
3266       if (dump_enabled_p ())
3267         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3268                         "reduction: not handled operation: ");
3269       return NULL;
3270     }
3271
3272   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3273     {
3274       if (dump_enabled_p ())
3275         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3276                         "reduction: both uses not ssa_names: ");
3277
3278       return NULL;
3279     }
3280
3281   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3282   if ((TREE_CODE (op1) == SSA_NAME
3283        && !types_compatible_p (type,TREE_TYPE (op1)))
3284       || (TREE_CODE (op2) == SSA_NAME
3285           && !types_compatible_p (type, TREE_TYPE (op2)))
3286       || (op3 && TREE_CODE (op3) == SSA_NAME
3287           && !types_compatible_p (type, TREE_TYPE (op3)))
3288       || (op4 && TREE_CODE (op4) == SSA_NAME
3289           && !types_compatible_p (type, TREE_TYPE (op4))))
3290     {
3291       if (dump_enabled_p ())
3292         {
3293           dump_printf_loc (MSG_NOTE, vect_location,
3294                            "reduction: multiple types: operation type: ");
3295           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3296           dump_printf (MSG_NOTE, ", operands types: ");
3297           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3298                              TREE_TYPE (op1));
3299           dump_printf (MSG_NOTE, ",");
3300           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3301                              TREE_TYPE (op2));
3302           if (op3)
3303             {
3304               dump_printf (MSG_NOTE, ",");
3305               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3306                                  TREE_TYPE (op3));
3307             }
3308
3309           if (op4)
3310             {
3311               dump_printf (MSG_NOTE, ",");
3312               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3313                                  TREE_TYPE (op4));
3314             }
3315           dump_printf (MSG_NOTE, "\n");
3316         }
3317
3318       return NULL;
3319     }
3320
3321   /* Check that it's ok to change the order of the computation.
3322      Generally, when vectorizing a reduction we change the order of the
3323      computation.  This may change the behavior of the program in some
3324      cases, so we need to check that this is ok.  One exception is when
3325      vectorizing an outer-loop: the inner-loop is executed sequentially,
3326      and therefore vectorizing reductions in the inner-loop during
3327      outer-loop vectorization is safe.  */
3328
3329   if (*v_reduc_type != COND_REDUCTION
3330       && check_reduction)
3331     {
3332       /* CHECKME: check for !flag_finite_math_only too?  */
3333       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
3334         {
3335           /* Changing the order of operations changes the semantics.  */
3336           if (dump_enabled_p ())
3337             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3338                         "reduction: unsafe fp math optimization: ");
3339           return NULL;
3340         }
3341       else if (INTEGRAL_TYPE_P (type))
3342         {
3343           if (!operation_no_trapping_overflow (type, code))
3344             {
3345               /* Changing the order of operations changes the semantics.  */
3346               if (dump_enabled_p ())
3347                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3348                                 "reduction: unsafe int math optimization"
3349                                 " (overflow traps): ");
3350               return NULL;
3351             }
3352           if (need_wrapping_integral_overflow
3353               && !TYPE_OVERFLOW_WRAPS (type)
3354               && operation_can_overflow (code))
3355             {
3356               /* Changing the order of operations changes the semantics.  */
3357               if (dump_enabled_p ())
3358                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3359                                 "reduction: unsafe int math optimization"
3360                                 " (overflow doesn't wrap): ");
3361               return NULL;
3362             }
3363         }
3364       else if (SAT_FIXED_POINT_TYPE_P (type))
3365         {
3366           /* Changing the order of operations changes the semantics.  */
3367           if (dump_enabled_p ())
3368           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3369                           "reduction: unsafe fixed-point math optimization: ");
3370           return NULL;
3371         }
3372     }
3373
3374   /* Reduction is safe. We're dealing with one of the following:
3375      1) integer arithmetic and no trapv
3376      2) floating point arithmetic, and special flags permit this optimization
3377      3) nested cycle (i.e., outer loop vectorization).  */
3378   if (TREE_CODE (op1) == SSA_NAME)
3379     def1 = SSA_NAME_DEF_STMT (op1);
3380
3381   if (TREE_CODE (op2) == SSA_NAME)
3382     def2 = SSA_NAME_DEF_STMT (op2);
3383
3384   if (code != COND_EXPR
3385       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3386     {
3387       if (dump_enabled_p ())
3388         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3389       return NULL;
3390     }
3391
3392   /* Check that one def is the reduction def, defined by PHI,
3393      the other def is either defined in the loop ("vect_internal_def"),
3394      or it's an induction (defined by a loop-header phi-node).  */
3395
3396   if (def2 && def2 == phi
3397       && (code == COND_EXPR
3398           || !def1 || gimple_nop_p (def1)
3399           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3400           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3401               && (is_gimple_assign (def1)
3402                   || is_gimple_call (def1)
3403                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3404                       == vect_induction_def
3405                   || (gimple_code (def1) == GIMPLE_PHI
3406                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3407                           == vect_internal_def
3408                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3409     {
3410       if (dump_enabled_p ())
3411         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3412       return def_stmt;
3413     }
3414
3415   if (def1 && def1 == phi
3416       && (code == COND_EXPR
3417           || !def2 || gimple_nop_p (def2)
3418           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3419           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3420               && (is_gimple_assign (def2)
3421                   || is_gimple_call (def2)
3422                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3423                        == vect_induction_def
3424                   || (gimple_code (def2) == GIMPLE_PHI
3425                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3426                            == vect_internal_def
3427                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3428     {
3429       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3430         {
3431           /* Check if we can swap operands (just for simplicity - so that
3432              the rest of the code can assume that the reduction variable
3433              is always the last (second) argument).  */
3434           if (code == COND_EXPR)
3435             {
3436               /* Swap cond_expr by inverting the condition.  */
3437               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3438               enum tree_code invert_code = ERROR_MARK;
3439               enum tree_code cond_code = TREE_CODE (cond_expr);
3440
3441               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3442                 {
3443                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3444                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3445                 }
3446               if (invert_code != ERROR_MARK)
3447                 {
3448                   TREE_SET_CODE (cond_expr, invert_code);
3449                   swap_ssa_operands (def_stmt,
3450                                      gimple_assign_rhs2_ptr (def_stmt),
3451                                      gimple_assign_rhs3_ptr (def_stmt));
3452                 }
3453               else
3454                 {
3455                   if (dump_enabled_p ())
3456                     report_vect_op (MSG_NOTE, def_stmt,
3457                                     "detected reduction: cannot swap operands "
3458                                     "for cond_expr");
3459                   return NULL;
3460                 }
3461             }
3462           else
3463             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3464                                gimple_assign_rhs2_ptr (def_stmt));
3465
3466           if (dump_enabled_p ())
3467             report_vect_op (MSG_NOTE, def_stmt,
3468                             "detected reduction: need to swap operands: ");
3469
3470           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3471             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3472         }
3473       else
3474         {
3475           if (dump_enabled_p ())
3476             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3477         }
3478
3479       return def_stmt;
3480     }
3481
3482   /* Try to find SLP reduction chain.  */
3483   if (! nested_in_vect_loop
3484       && code != COND_EXPR
3485       && orig_code != MINUS_EXPR
3486       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3487     {
3488       if (dump_enabled_p ())
3489         report_vect_op (MSG_NOTE, def_stmt,
3490                         "reduction: detected reduction chain: ");
3491
3492       return def_stmt;
3493     }
3494
3495   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3496   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3497   while (first)
3498     {
3499       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3500       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3501       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3502       first = next;
3503     }
3504
3505   /* Look for the expression computing loop_arg from loop PHI result.  */
3506   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3507                             code))
3508     return def_stmt;
3509
3510   if (dump_enabled_p ())
3511     {
3512       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3513                       "reduction: unknown pattern: ");
3514     }
3515
3516   return NULL;
3517 }
3518
3519 /* Wrapper around vect_is_simple_reduction, which will modify code
3520    in-place if it enables detection of more reductions.  Arguments
3521    as there.  */
3522
3523 gimple *
3524 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3525                              bool *double_reduc,
3526                              bool need_wrapping_integral_overflow)
3527 {
3528   enum vect_reduction_type v_reduc_type;
3529   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3530                                           need_wrapping_integral_overflow,
3531                                           &v_reduc_type);
3532   if (def)
3533     {
3534       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3535       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3536       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3537       reduc_def_info = vinfo_for_stmt (def);
3538       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3539     }
3540   return def;
3541 }
3542
3543 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3544 int
3545 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3546                              int *peel_iters_epilogue,
3547                              stmt_vector_for_cost *scalar_cost_vec,
3548                              stmt_vector_for_cost *prologue_cost_vec,
3549                              stmt_vector_for_cost *epilogue_cost_vec)
3550 {
3551   int retval = 0;
3552   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3553
3554   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3555     {
3556       *peel_iters_epilogue = assumed_vf / 2;
3557       if (dump_enabled_p ())
3558         dump_printf_loc (MSG_NOTE, vect_location,
3559                          "cost model: epilogue peel iters set to vf/2 "
3560                          "because loop iterations are unknown .\n");
3561
3562       /* If peeled iterations are known but number of scalar loop
3563          iterations are unknown, count a taken branch per peeled loop.  */
3564       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3565                                  NULL, 0, vect_prologue);
3566       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3567                                  NULL, 0, vect_epilogue);
3568     }
3569   else
3570     {
3571       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3572       peel_iters_prologue = niters < peel_iters_prologue ?
3573                             niters : peel_iters_prologue;
3574       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3575       /* If we need to peel for gaps, but no peeling is required, we have to
3576          peel VF iterations.  */
3577       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3578         *peel_iters_epilogue = assumed_vf;
3579     }
3580
3581   stmt_info_for_cost *si;
3582   int j;
3583   if (peel_iters_prologue)
3584     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3585         {
3586           stmt_vec_info stmt_info
3587             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3588           retval += record_stmt_cost (prologue_cost_vec,
3589                                       si->count * peel_iters_prologue,
3590                                       si->kind, stmt_info, si->misalign,
3591                                       vect_prologue);
3592         }
3593   if (*peel_iters_epilogue)
3594     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3595         {
3596           stmt_vec_info stmt_info
3597             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3598           retval += record_stmt_cost (epilogue_cost_vec,
3599                                       si->count * *peel_iters_epilogue,
3600                                       si->kind, stmt_info, si->misalign,
3601                                       vect_epilogue);
3602         }
3603
3604   return retval;
3605 }
3606
3607 /* Function vect_estimate_min_profitable_iters
3608
3609    Return the number of iterations required for the vector version of the
3610    loop to be profitable relative to the cost of the scalar version of the
3611    loop.
3612
3613    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3614    of iterations for vectorization.  -1 value means loop vectorization
3615    is not profitable.  This returned value may be used for dynamic
3616    profitability check.
3617
3618    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3619    for static check against estimated number of iterations.  */
3620
3621 static void
3622 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3623                                     int *ret_min_profitable_niters,
3624                                     int *ret_min_profitable_estimate)
3625 {
3626   int min_profitable_iters;
3627   int min_profitable_estimate;
3628   int peel_iters_prologue;
3629   int peel_iters_epilogue;
3630   unsigned vec_inside_cost = 0;
3631   int vec_outside_cost = 0;
3632   unsigned vec_prologue_cost = 0;
3633   unsigned vec_epilogue_cost = 0;
3634   int scalar_single_iter_cost = 0;
3635   int scalar_outside_cost = 0;
3636   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3637   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3638   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3639
3640   /* Cost model disabled.  */
3641   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3642     {
3643       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3644       *ret_min_profitable_niters = 0;
3645       *ret_min_profitable_estimate = 0;
3646       return;
3647     }
3648
3649   /* Requires loop versioning tests to handle misalignment.  */
3650   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3651     {
3652       /*  FIXME: Make cost depend on complexity of individual check.  */
3653       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3654       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3655                             vect_prologue);
3656       dump_printf (MSG_NOTE,
3657                    "cost model: Adding cost of checks for loop "
3658                    "versioning to treat misalignment.\n");
3659     }
3660
3661   /* Requires loop versioning with alias checks.  */
3662   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3663     {
3664       /*  FIXME: Make cost depend on complexity of individual check.  */
3665       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3666       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3667                             vect_prologue);
3668       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3669       if (len)
3670         /* Count LEN - 1 ANDs and LEN comparisons.  */
3671         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3672                               NULL, 0, vect_prologue);
3673       dump_printf (MSG_NOTE,
3674                    "cost model: Adding cost of checks for loop "
3675                    "versioning aliasing.\n");
3676     }
3677
3678   /* Requires loop versioning with niter checks.  */
3679   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3680     {
3681       /*  FIXME: Make cost depend on complexity of individual check.  */
3682       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3683                             vect_prologue);
3684       dump_printf (MSG_NOTE,
3685                    "cost model: Adding cost of checks for loop "
3686                    "versioning niters.\n");
3687     }
3688
3689   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3690     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3691                           vect_prologue);
3692
3693   /* Count statements in scalar loop.  Using this as scalar cost for a single
3694      iteration for now.
3695
3696      TODO: Add outer loop support.
3697
3698      TODO: Consider assigning different costs to different scalar
3699      statements.  */
3700
3701   scalar_single_iter_cost
3702     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3703
3704   /* Add additional cost for the peeled instructions in prologue and epilogue
3705      loop.  (For fully-masked loops there will be no peeling.)
3706
3707      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3708      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3709
3710      TODO: Build an expression that represents peel_iters for prologue and
3711      epilogue to be used in a run-time test.  */
3712
3713   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3714     {
3715       peel_iters_prologue = 0;
3716       peel_iters_epilogue = 0;
3717     }
3718   else if (npeel < 0)
3719     {
3720       peel_iters_prologue = assumed_vf / 2;
3721       dump_printf (MSG_NOTE, "cost model: "
3722                    "prologue peel iters set to vf/2.\n");
3723
3724       /* If peeling for alignment is unknown, loop bound of main loop becomes
3725          unknown.  */
3726       peel_iters_epilogue = assumed_vf / 2;
3727       dump_printf (MSG_NOTE, "cost model: "
3728                    "epilogue peel iters set to vf/2 because "
3729                    "peeling for alignment is unknown.\n");
3730
3731       /* If peeled iterations are unknown, count a taken branch and a not taken
3732          branch per peeled loop. Even if scalar loop iterations are known,
3733          vector iterations are not known since peeled prologue iterations are
3734          not known. Hence guards remain the same.  */
3735       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3736                             NULL, 0, vect_prologue);
3737       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3738                             NULL, 0, vect_prologue);
3739       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3740                             NULL, 0, vect_epilogue);
3741       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3742                             NULL, 0, vect_epilogue);
3743       stmt_info_for_cost *si;
3744       int j;
3745       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3746         {
3747           struct _stmt_vec_info *stmt_info
3748             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3749           (void) add_stmt_cost (target_cost_data,
3750                                 si->count * peel_iters_prologue,
3751                                 si->kind, stmt_info, si->misalign,
3752                                 vect_prologue);
3753           (void) add_stmt_cost (target_cost_data,
3754                                 si->count * peel_iters_epilogue,
3755                                 si->kind, stmt_info, si->misalign,
3756                                 vect_epilogue);
3757         }
3758     }
3759   else
3760     {
3761       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3762       stmt_info_for_cost *si;
3763       int j;
3764       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3765
3766       prologue_cost_vec.create (2);
3767       epilogue_cost_vec.create (2);
3768       peel_iters_prologue = npeel;
3769
3770       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3771                                           &peel_iters_epilogue,
3772                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3773                                             (loop_vinfo),
3774                                           &prologue_cost_vec,
3775                                           &epilogue_cost_vec);
3776
3777       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3778         {
3779           struct _stmt_vec_info *stmt_info
3780             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3781           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3782                                 si->misalign, vect_prologue);
3783         }
3784
3785       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3786         {
3787           struct _stmt_vec_info *stmt_info
3788             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3789           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3790                                 si->misalign, vect_epilogue);
3791         }
3792
3793       prologue_cost_vec.release ();
3794       epilogue_cost_vec.release ();
3795     }
3796
3797   /* FORNOW: The scalar outside cost is incremented in one of the
3798      following ways:
3799
3800      1. The vectorizer checks for alignment and aliasing and generates
3801      a condition that allows dynamic vectorization.  A cost model
3802      check is ANDED with the versioning condition.  Hence scalar code
3803      path now has the added cost of the versioning check.
3804
3805        if (cost > th & versioning_check)
3806          jmp to vector code
3807
3808      Hence run-time scalar is incremented by not-taken branch cost.
3809
3810      2. The vectorizer then checks if a prologue is required.  If the
3811      cost model check was not done before during versioning, it has to
3812      be done before the prologue check.
3813
3814        if (cost <= th)
3815          prologue = scalar_iters
3816        if (prologue == 0)
3817          jmp to vector code
3818        else
3819          execute prologue
3820        if (prologue == num_iters)
3821          go to exit
3822
3823      Hence the run-time scalar cost is incremented by a taken branch,
3824      plus a not-taken branch, plus a taken branch cost.
3825
3826      3. The vectorizer then checks if an epilogue is required.  If the
3827      cost model check was not done before during prologue check, it
3828      has to be done with the epilogue check.
3829
3830        if (prologue == 0)
3831          jmp to vector code
3832        else
3833          execute prologue
3834        if (prologue == num_iters)
3835          go to exit
3836        vector code:
3837          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3838            jmp to epilogue
3839
3840      Hence the run-time scalar cost should be incremented by 2 taken
3841      branches.
3842
3843      TODO: The back end may reorder the BBS's differently and reverse
3844      conditions/branch directions.  Change the estimates below to
3845      something more reasonable.  */
3846
3847   /* If the number of iterations is known and we do not do versioning, we can
3848      decide whether to vectorize at compile time.  Hence the scalar version
3849      do not carry cost model guard costs.  */
3850   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3851       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3852     {
3853       /* Cost model check occurs at versioning.  */
3854       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3855         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3856       else
3857         {
3858           /* Cost model check occurs at prologue generation.  */
3859           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3860             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3861               + vect_get_stmt_cost (cond_branch_not_taken);
3862           /* Cost model check occurs at epilogue generation.  */
3863           else
3864             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3865         }
3866     }
3867
3868   /* Complete the target-specific cost calculations.  */
3869   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3870                &vec_inside_cost, &vec_epilogue_cost);
3871
3872   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3873
3874   if (dump_enabled_p ())
3875     {
3876       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3877       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3878                    vec_inside_cost);
3879       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3880                    vec_prologue_cost);
3881       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3882                    vec_epilogue_cost);
3883       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3884                    scalar_single_iter_cost);
3885       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3886                    scalar_outside_cost);
3887       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3888                    vec_outside_cost);
3889       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3890                    peel_iters_prologue);
3891       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3892                    peel_iters_epilogue);
3893     }
3894
3895   /* Calculate number of iterations required to make the vector version
3896      profitable, relative to the loop bodies only.  The following condition
3897      must hold true:
3898      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3899      where
3900      SIC = scalar iteration cost, VIC = vector iteration cost,
3901      VOC = vector outside cost, VF = vectorization factor,
3902      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3903      SOC = scalar outside cost for run time cost model check.  */
3904
3905   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3906     {
3907       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3908                               * assumed_vf
3909                               - vec_inside_cost * peel_iters_prologue
3910                               - vec_inside_cost * peel_iters_epilogue);
3911       if (min_profitable_iters <= 0)
3912         min_profitable_iters = 0;
3913       else
3914         {
3915           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3916                                    - vec_inside_cost);
3917
3918           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3919               <= (((int) vec_inside_cost * min_profitable_iters)
3920                   + (((int) vec_outside_cost - scalar_outside_cost)
3921                      * assumed_vf)))
3922             min_profitable_iters++;
3923         }
3924     }
3925   /* vector version will never be profitable.  */
3926   else
3927     {
3928       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3929         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3930                     "did not happen for a simd loop");
3931
3932       if (dump_enabled_p ())
3933         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3934                          "cost model: the vector iteration cost = %d "
3935                          "divided by the scalar iteration cost = %d "
3936                          "is greater or equal to the vectorization factor = %d"
3937                          ".\n",
3938                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3939       *ret_min_profitable_niters = -1;
3940       *ret_min_profitable_estimate = -1;
3941       return;
3942     }
3943
3944   dump_printf (MSG_NOTE,
3945                "  Calculated minimum iters for profitability: %d\n",
3946                min_profitable_iters);
3947
3948   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3949       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3950     /* We want the vectorized loop to execute at least once.  */
3951     min_profitable_iters = assumed_vf + peel_iters_prologue;
3952
3953   if (dump_enabled_p ())
3954     dump_printf_loc (MSG_NOTE, vect_location,
3955                      "  Runtime profitability threshold = %d\n",
3956                      min_profitable_iters);
3957
3958   *ret_min_profitable_niters = min_profitable_iters;
3959
3960   /* Calculate number of iterations required to make the vector version
3961      profitable, relative to the loop bodies only.
3962
3963      Non-vectorized variant is SIC * niters and it must win over vector
3964      variant on the expected loop trip count.  The following condition must hold true:
3965      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3966
3967   if (vec_outside_cost <= 0)
3968     min_profitable_estimate = 0;
3969   else
3970     {
3971       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3972                                  * assumed_vf
3973                                  - vec_inside_cost * peel_iters_prologue
3974                                  - vec_inside_cost * peel_iters_epilogue)
3975                                  / ((scalar_single_iter_cost * assumed_vf)
3976                                    - vec_inside_cost);
3977     }
3978   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3979   if (dump_enabled_p ())
3980     dump_printf_loc (MSG_NOTE, vect_location,
3981                      "  Static estimate profitability threshold = %d\n",
3982                      min_profitable_estimate);
3983
3984   *ret_min_profitable_estimate = min_profitable_estimate;
3985 }
3986
3987 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3988    vector elements (not bits) for a vector with NELT elements.  */
3989 static void
3990 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3991                               vec_perm_builder *sel)
3992 {
3993   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3994      by vec_perm_indices.  */
3995   sel->new_vector (nelt, 1, 3);
3996   for (unsigned int i = 0; i < 3; i++)
3997     sel->quick_push (i + offset);
3998 }
3999
4000 /* Checks whether the target supports whole-vector shifts for vectors of mode
4001    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4002    it supports vec_perm_const with masks for all necessary shift amounts.  */
4003 static bool
4004 have_whole_vector_shift (machine_mode mode)
4005 {
4006   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4007     return true;
4008
4009   /* Variable-length vectors should be handled via the optab.  */
4010   unsigned int nelt;
4011   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4012     return false;
4013
4014   vec_perm_builder sel;
4015   vec_perm_indices indices;
4016   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4017     {
4018       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4019       indices.new_vector (sel, 2, nelt);
4020       if (!can_vec_perm_const_p (mode, indices, false))
4021         return false;
4022     }
4023   return true;
4024 }
4025
4026 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4027    functions. Design better to avoid maintenance issues.  */
4028
4029 /* Function vect_model_reduction_cost.
4030
4031    Models cost for a reduction operation, including the vector ops
4032    generated within the strip-mine loop, the initial definition before
4033    the loop, and the epilogue code that must be generated.  */
4034
4035 static void
4036 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4037                            int ncopies)
4038 {
4039   int prologue_cost = 0, epilogue_cost = 0;
4040   enum tree_code code;
4041   optab optab;
4042   tree vectype;
4043   gimple *orig_stmt;
4044   machine_mode mode;
4045   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4046   struct loop *loop = NULL;
4047   void *target_cost_data;
4048
4049   if (loop_vinfo)
4050     {
4051       loop = LOOP_VINFO_LOOP (loop_vinfo);
4052       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4053     }
4054   else
4055     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4056
4057   /* Condition reductions generate two reductions in the loop.  */
4058   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4059     ncopies *= 2;
4060
4061   /* Cost of reduction op inside loop.  */
4062   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4063                                         stmt_info, 0, vect_body);
4064
4065   vectype = STMT_VINFO_VECTYPE (stmt_info);
4066   mode = TYPE_MODE (vectype);
4067   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4068
4069   if (!orig_stmt)
4070     orig_stmt = STMT_VINFO_STMT (stmt_info);
4071
4072   code = gimple_assign_rhs_code (orig_stmt);
4073
4074   /* Add in cost for initial definition.
4075      For cond reduction we have four vectors: initial index, step, initial
4076      result of the data reduction, initial value of the index reduction.  */
4077   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4078                        == COND_REDUCTION ? 4 : 1;
4079   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4080                                   scalar_to_vec, stmt_info, 0,
4081                                   vect_prologue);
4082
4083   /* Determine cost of epilogue code.
4084
4085      We have a reduction operator that will reduce the vector in one statement.
4086      Also requires scalar extract.  */
4087
4088   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4089     {
4090       if (reduc_fn != IFN_LAST)
4091         {
4092           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4093             {
4094               /* An EQ stmt and an COND_EXPR stmt.  */
4095               epilogue_cost += add_stmt_cost (target_cost_data, 2,
4096                                               vector_stmt, stmt_info, 0,
4097                                               vect_epilogue);
4098               /* Reduction of the max index and a reduction of the found
4099                  values.  */
4100               epilogue_cost += add_stmt_cost (target_cost_data, 2,
4101                                               vec_to_scalar, stmt_info, 0,
4102                                               vect_epilogue);
4103               /* A broadcast of the max value.  */
4104               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4105                                               scalar_to_vec, stmt_info, 0,
4106                                               vect_epilogue);
4107             }
4108           else
4109             {
4110               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4111                                               stmt_info, 0, vect_epilogue);
4112               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4113                                               vec_to_scalar, stmt_info, 0,
4114                                               vect_epilogue);
4115             }
4116         }
4117       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4118         {
4119           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4120           /* Extraction of scalar elements.  */
4121           epilogue_cost += add_stmt_cost (target_cost_data,
4122                                           2 * estimated_nunits,
4123                                           vec_to_scalar, stmt_info, 0,
4124                                           vect_epilogue);
4125           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4126           epilogue_cost += add_stmt_cost (target_cost_data,
4127                                           2 * estimated_nunits - 3,
4128                                           scalar_stmt, stmt_info, 0,
4129                                           vect_epilogue);
4130         }
4131       else
4132         {
4133           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4134           tree bitsize =
4135             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4136           int element_bitsize = tree_to_uhwi (bitsize);
4137           int nelements = vec_size_in_bits / element_bitsize;
4138
4139           if (code == COND_EXPR)
4140             code = MAX_EXPR;
4141
4142           optab = optab_for_tree_code (code, vectype, optab_default);
4143
4144           /* We have a whole vector shift available.  */
4145           if (optab != unknown_optab
4146               && VECTOR_MODE_P (mode)
4147               && optab_handler (optab, mode) != CODE_FOR_nothing
4148               && have_whole_vector_shift (mode))
4149             {
4150               /* Final reduction via vector shifts and the reduction operator.
4151                  Also requires scalar extract.  */
4152               epilogue_cost += add_stmt_cost (target_cost_data,
4153                                               exact_log2 (nelements) * 2,
4154                                               vector_stmt, stmt_info, 0,
4155                                               vect_epilogue);
4156               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4157                                               vec_to_scalar, stmt_info, 0,
4158                                               vect_epilogue);
4159             }
4160           else
4161             /* Use extracts and reduction op for final reduction.  For N
4162                elements, we have N extracts and N-1 reduction ops.  */
4163             epilogue_cost += add_stmt_cost (target_cost_data,
4164                                             nelements + nelements - 1,
4165                                             vector_stmt, stmt_info, 0,
4166                                             vect_epilogue);
4167         }
4168     }
4169
4170   if (dump_enabled_p ())
4171     dump_printf (MSG_NOTE,
4172                  "vect_model_reduction_cost: inside_cost = %d, "
4173                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4174                  prologue_cost, epilogue_cost);
4175 }
4176
4177
4178 /* Function vect_model_induction_cost.
4179
4180    Models cost for induction operations.  */
4181
4182 static void
4183 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4184 {
4185   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4186   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4187   unsigned inside_cost, prologue_cost;
4188
4189   if (PURE_SLP_STMT (stmt_info))
4190     return;
4191
4192   /* loop cost for vec_loop.  */
4193   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4194                                stmt_info, 0, vect_body);
4195
4196   /* prologue cost for vec_init and vec_step.  */
4197   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4198                                  stmt_info, 0, vect_prologue);
4199
4200   if (dump_enabled_p ())
4201     dump_printf_loc (MSG_NOTE, vect_location,
4202                      "vect_model_induction_cost: inside_cost = %d, "
4203                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4204 }
4205
4206
4207
4208 /* Function get_initial_def_for_reduction
4209
4210    Input:
4211    STMT - a stmt that performs a reduction operation in the loop.
4212    INIT_VAL - the initial value of the reduction variable
4213
4214    Output:
4215    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4216         of the reduction (used for adjusting the epilog - see below).
4217    Return a vector variable, initialized according to the operation that STMT
4218         performs. This vector will be used as the initial value of the
4219         vector of partial results.
4220
4221    Option1 (adjust in epilog): Initialize the vector as follows:
4222      add/bit or/xor:    [0,0,...,0,0]
4223      mult/bit and:      [1,1,...,1,1]
4224      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4225    and when necessary (e.g. add/mult case) let the caller know
4226    that it needs to adjust the result by init_val.
4227
4228    Option2: Initialize the vector as follows:
4229      add/bit or/xor:    [init_val,0,0,...,0]
4230      mult/bit and:      [init_val,1,1,...,1]
4231      min/max/cond_expr: [init_val,init_val,...,init_val]
4232    and no adjustments are needed.
4233
4234    For example, for the following code:
4235
4236    s = init_val;
4237    for (i=0;i<n;i++)
4238      s = s + a[i];
4239
4240    STMT is 's = s + a[i]', and the reduction variable is 's'.
4241    For a vector of 4 units, we want to return either [0,0,0,init_val],
4242    or [0,0,0,0] and let the caller know that it needs to adjust
4243    the result at the end by 'init_val'.
4244
4245    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4246    initialization vector is simpler (same element in all entries), if
4247    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4248
4249    A cost model should help decide between these two schemes.  */
4250
4251 tree
4252 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4253                                tree *adjustment_def)
4254 {
4255   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4256   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4257   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4258   tree scalar_type = TREE_TYPE (init_val);
4259   tree vectype = get_vectype_for_scalar_type (scalar_type);
4260   enum tree_code code = gimple_assign_rhs_code (stmt);
4261   tree def_for_init;
4262   tree init_def;
4263   bool nested_in_vect_loop = false;
4264   REAL_VALUE_TYPE real_init_val = dconst0;
4265   int int_init_val = 0;
4266   gimple *def_stmt = NULL;
4267   gimple_seq stmts = NULL;
4268
4269   gcc_assert (vectype);
4270
4271   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4272               || SCALAR_FLOAT_TYPE_P (scalar_type));
4273
4274   if (nested_in_vect_loop_p (loop, stmt))
4275     nested_in_vect_loop = true;
4276   else
4277     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4278
4279   /* In case of double reduction we only create a vector variable to be put
4280      in the reduction phi node.  The actual statement creation is done in
4281      vect_create_epilog_for_reduction.  */
4282   if (adjustment_def && nested_in_vect_loop
4283       && TREE_CODE (init_val) == SSA_NAME
4284       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4285       && gimple_code (def_stmt) == GIMPLE_PHI
4286       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4287       && vinfo_for_stmt (def_stmt)
4288       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4289           == vect_double_reduction_def)
4290     {
4291       *adjustment_def = NULL;
4292       return vect_create_destination_var (init_val, vectype);
4293     }
4294
4295   /* In case of a nested reduction do not use an adjustment def as
4296      that case is not supported by the epilogue generation correctly
4297      if ncopies is not one.  */
4298   if (adjustment_def && nested_in_vect_loop)
4299     {
4300       *adjustment_def = NULL;
4301       return vect_get_vec_def_for_operand (init_val, stmt);
4302     }
4303
4304   switch (code)
4305     {
4306     case WIDEN_SUM_EXPR:
4307     case DOT_PROD_EXPR:
4308     case SAD_EXPR:
4309     case PLUS_EXPR:
4310     case MINUS_EXPR:
4311     case BIT_IOR_EXPR:
4312     case BIT_XOR_EXPR:
4313     case MULT_EXPR:
4314     case BIT_AND_EXPR:
4315       {
4316         /* ADJUSTMENT_DEF is NULL when called from
4317            vect_create_epilog_for_reduction to vectorize double reduction.  */
4318         if (adjustment_def)
4319           *adjustment_def = init_val;
4320
4321         if (code == MULT_EXPR)
4322           {
4323             real_init_val = dconst1;
4324             int_init_val = 1;
4325           }
4326
4327         if (code == BIT_AND_EXPR)
4328           int_init_val = -1;
4329
4330         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4331           def_for_init = build_real (scalar_type, real_init_val);
4332         else
4333           def_for_init = build_int_cst (scalar_type, int_init_val);
4334
4335         if (adjustment_def)
4336           /* Option1: the first element is '0' or '1' as well.  */
4337           init_def = gimple_build_vector_from_val (&stmts, vectype,
4338                                                    def_for_init);
4339         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4340           {
4341             /* Option2 (variable length): the first element is INIT_VAL.  */
4342             init_def = build_vector_from_val (vectype, def_for_init);
4343             gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4344                                                       2, init_def, init_val);
4345             init_def = make_ssa_name (vectype);
4346             gimple_call_set_lhs (call, init_def);
4347             gimple_seq_add_stmt (&stmts, call);
4348           }
4349         else
4350           {
4351             /* Option2: the first element is INIT_VAL.  */
4352             tree_vector_builder elts (vectype, 1, 2);
4353             elts.quick_push (init_val);
4354             elts.quick_push (def_for_init);
4355             init_def = gimple_build_vector (&stmts, &elts);
4356           }
4357       }
4358       break;
4359
4360     case MIN_EXPR:
4361     case MAX_EXPR:
4362     case COND_EXPR:
4363       {
4364         if (adjustment_def)
4365           {
4366             *adjustment_def = NULL_TREE;
4367             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4368               {
4369                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4370                 break;
4371               }
4372           }
4373         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4374         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4375       }
4376       break;
4377
4378     default:
4379       gcc_unreachable ();
4380     }
4381
4382   if (stmts)
4383     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4384   return init_def;
4385 }
4386
4387 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4388    NUMBER_OF_VECTORS is the number of vector defs to create.
4389    If NEUTRAL_OP is nonnull, introducing extra elements of that
4390    value will not change the result.  */
4391
4392 static void
4393 get_initial_defs_for_reduction (slp_tree slp_node,
4394                                 vec<tree> *vec_oprnds,
4395                                 unsigned int number_of_vectors,
4396                                 bool reduc_chain, tree neutral_op)
4397 {
4398   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4399   gimple *stmt = stmts[0];
4400   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4401   unsigned HOST_WIDE_INT nunits;
4402   unsigned j, number_of_places_left_in_vector;
4403   tree vector_type;
4404   tree vop;
4405   int group_size = stmts.length ();
4406   unsigned int vec_num, i;
4407   unsigned number_of_copies = 1;
4408   vec<tree> voprnds;
4409   voprnds.create (number_of_vectors);
4410   struct loop *loop;
4411   auto_vec<tree, 16> permute_results;
4412
4413   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4414
4415   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4416
4417   loop = (gimple_bb (stmt))->loop_father;
4418   gcc_assert (loop);
4419   edge pe = loop_preheader_edge (loop);
4420
4421   gcc_assert (!reduc_chain || neutral_op);
4422
4423   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4424      created vectors. It is greater than 1 if unrolling is performed.
4425
4426      For example, we have two scalar operands, s1 and s2 (e.g., group of
4427      strided accesses of size two), while NUNITS is four (i.e., four scalars
4428      of this type can be packed in a vector).  The output vector will contain
4429      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4430      will be 2).
4431
4432      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4433      containing the operands.
4434
4435      For example, NUNITS is four as before, and the group size is 8
4436      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4437      {s5, s6, s7, s8}.  */
4438
4439   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4440     nunits = group_size;
4441
4442   number_of_copies = nunits * number_of_vectors / group_size;
4443
4444   number_of_places_left_in_vector = nunits;
4445   bool constant_p = true;
4446   tree_vector_builder elts (vector_type, nunits, 1);
4447   elts.quick_grow (nunits);
4448   for (j = 0; j < number_of_copies; j++)
4449     {
4450       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4451         {
4452           tree op;
4453           /* Get the def before the loop.  In reduction chain we have only
4454              one initial value.  */
4455           if ((j != (number_of_copies - 1)
4456                || (reduc_chain && i != 0))
4457               && neutral_op)
4458             op = neutral_op;
4459           else
4460             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4461
4462           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4463           number_of_places_left_in_vector--;
4464           elts[number_of_places_left_in_vector] = op;
4465           if (!CONSTANT_CLASS_P (op))
4466             constant_p = false;
4467
4468           if (number_of_places_left_in_vector == 0)
4469             {
4470               gimple_seq ctor_seq = NULL;
4471               tree init;
4472               if (constant_p && !neutral_op
4473                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4474                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4475                 /* Build the vector directly from ELTS.  */
4476                 init = gimple_build_vector (&ctor_seq, &elts);
4477               else if (neutral_op)
4478                 {
4479                   /* Build a vector of the neutral value and shift the
4480                      other elements into place.  */
4481                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4482                                                        neutral_op);
4483                   int k = nunits;
4484                   while (k > 0 && elts[k - 1] == neutral_op)
4485                     k -= 1;
4486                   while (k > 0)
4487                     {
4488                       k -= 1;
4489                       gcall *call = gimple_build_call_internal
4490                         (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4491                       init = make_ssa_name (vector_type);
4492                       gimple_call_set_lhs (call, init);
4493                       gimple_seq_add_stmt (&ctor_seq, call);
4494                     }
4495                 }
4496               else
4497                 {
4498                   /* First time round, duplicate ELTS to fill the
4499                      required number of vectors, then cherry pick the
4500                      appropriate result for each iteration.  */
4501                   if (vec_oprnds->is_empty ())
4502                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4503                                               number_of_vectors,
4504                                               permute_results);
4505                   init = permute_results[number_of_vectors - j - 1];
4506                 }
4507               if (ctor_seq != NULL)
4508                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4509               voprnds.quick_push (init);
4510
4511               number_of_places_left_in_vector = nunits;
4512               elts.new_vector (vector_type, nunits, 1);
4513               elts.quick_grow (nunits);
4514               constant_p = true;
4515             }
4516         }
4517     }
4518
4519   /* Since the vectors are created in the reverse order, we should invert
4520      them.  */
4521   vec_num = voprnds.length ();
4522   for (j = vec_num; j != 0; j--)
4523     {
4524       vop = voprnds[j - 1];
4525       vec_oprnds->quick_push (vop);
4526     }
4527
4528   voprnds.release ();
4529
4530   /* In case that VF is greater than the unrolling factor needed for the SLP
4531      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4532      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4533      to replicate the vectors.  */
4534   tree neutral_vec = NULL;
4535   while (number_of_vectors > vec_oprnds->length ())
4536     {
4537       if (neutral_op)
4538         {
4539           if (!neutral_vec)
4540             {
4541               gimple_seq ctor_seq = NULL;
4542               neutral_vec = gimple_build_vector_from_val
4543                 (&ctor_seq, vector_type, neutral_op);
4544               if (ctor_seq != NULL)
4545                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4546             }
4547           vec_oprnds->quick_push (neutral_vec);
4548         }
4549       else
4550         {
4551           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4552             vec_oprnds->quick_push (vop);
4553         }
4554     }
4555 }
4556
4557
4558 /* Function vect_create_epilog_for_reduction
4559
4560    Create code at the loop-epilog to finalize the result of a reduction
4561    computation.
4562
4563    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4564      reduction statements.
4565    STMT is the scalar reduction stmt that is being vectorized.
4566    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4567      number of elements that we can fit in a vectype (nunits).  In this case
4568      we have to generate more than one vector stmt - i.e - we need to "unroll"
4569      the vector stmt by a factor VF/nunits.  For more details see documentation
4570      in vectorizable_operation.
4571    REDUC_FN is the internal function for the epilog reduction.
4572    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4573      computation.
4574    REDUC_INDEX is the index of the operand in the right hand side of the
4575      statement that is defined by REDUCTION_PHI.
4576    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4577    SLP_NODE is an SLP node containing a group of reduction statements. The
4578      first one in this group is STMT.
4579    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4580      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4581      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4582      any value of the IV in the loop.
4583    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4584    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4585      null if this is not an SLP reduction
4586
4587    This function:
4588    1. Creates the reduction def-use cycles: sets the arguments for
4589       REDUCTION_PHIS:
4590       The loop-entry argument is the vectorized initial-value of the reduction.
4591       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4592       sums.
4593    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4594       by calling the function specified by REDUC_FN if available, or by
4595       other means (whole-vector shifts or a scalar loop).
4596       The function also creates a new phi node at the loop exit to preserve
4597       loop-closed form, as illustrated below.
4598
4599      The flow at the entry to this function:
4600
4601         loop:
4602           vec_def = phi <null, null>            # REDUCTION_PHI
4603           VECT_DEF = vector_stmt                # vectorized form of STMT
4604           s_loop = scalar_stmt                  # (scalar) STMT
4605         loop_exit:
4606           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4607           use <s_out0>
4608           use <s_out0>
4609
4610      The above is transformed by this function into:
4611
4612         loop:
4613           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4614           VECT_DEF = vector_stmt                # vectorized form of STMT
4615           s_loop = scalar_stmt                  # (scalar) STMT
4616         loop_exit:
4617           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4618           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4619           v_out2 = reduce <v_out1>
4620           s_out3 = extract_field <v_out2, 0>
4621           s_out4 = adjust_result <s_out3>
4622           use <s_out4>
4623           use <s_out4>
4624 */
4625
4626 static void
4627 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4628                                   gimple *reduc_def_stmt,
4629                                   int ncopies, internal_fn reduc_fn,
4630                                   vec<gimple *> reduction_phis,
4631                                   bool double_reduc,
4632                                   slp_tree slp_node,
4633                                   slp_instance slp_node_instance,
4634                                   tree induc_val, enum tree_code induc_code,
4635                                   tree neutral_op)
4636 {
4637   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4638   stmt_vec_info prev_phi_info;
4639   tree vectype;
4640   machine_mode mode;
4641   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4642   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4643   basic_block exit_bb;
4644   tree scalar_dest;
4645   tree scalar_type;
4646   gimple *new_phi = NULL, *phi;
4647   gimple_stmt_iterator exit_gsi;
4648   tree vec_dest;
4649   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4650   gimple *epilog_stmt = NULL;
4651   enum tree_code code = gimple_assign_rhs_code (stmt);
4652   gimple *exit_phi;
4653   tree bitsize;
4654   tree adjustment_def = NULL;
4655   tree vec_initial_def = NULL;
4656   tree expr, def, initial_def = NULL;
4657   tree orig_name, scalar_result;
4658   imm_use_iterator imm_iter, phi_imm_iter;
4659   use_operand_p use_p, phi_use_p;
4660   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4661   bool nested_in_vect_loop = false;
4662   auto_vec<gimple *> new_phis;
4663   auto_vec<gimple *> inner_phis;
4664   enum vect_def_type dt = vect_unknown_def_type;
4665   int j, i;
4666   auto_vec<tree> scalar_results;
4667   unsigned int group_size = 1, k, ratio;
4668   auto_vec<tree> vec_initial_defs;
4669   auto_vec<gimple *> phis;
4670   bool slp_reduc = false;
4671   bool direct_slp_reduc;
4672   tree new_phi_result;
4673   gimple *inner_phi = NULL;
4674   tree induction_index = NULL_TREE;
4675
4676   if (slp_node)
4677     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4678
4679   if (nested_in_vect_loop_p (loop, stmt))
4680     {
4681       outer_loop = loop;
4682       loop = loop->inner;
4683       nested_in_vect_loop = true;
4684       gcc_assert (!slp_node);
4685     }
4686
4687   vectype = STMT_VINFO_VECTYPE (stmt_info);
4688   gcc_assert (vectype);
4689   mode = TYPE_MODE (vectype);
4690
4691   /* 1. Create the reduction def-use cycle:
4692      Set the arguments of REDUCTION_PHIS, i.e., transform
4693
4694         loop:
4695           vec_def = phi <null, null>            # REDUCTION_PHI
4696           VECT_DEF = vector_stmt                # vectorized form of STMT
4697           ...
4698
4699      into:
4700
4701         loop:
4702           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4703           VECT_DEF = vector_stmt                # vectorized form of STMT
4704           ...
4705
4706      (in case of SLP, do it for all the phis). */
4707
4708   /* Get the loop-entry arguments.  */
4709   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4710   if (slp_node)
4711     {
4712       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4713       vec_initial_defs.reserve (vec_num);
4714       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4715                                       &vec_initial_defs, vec_num,
4716                                       GROUP_FIRST_ELEMENT (stmt_info),
4717                                       neutral_op);
4718     }
4719   else
4720     {
4721       /* Get at the scalar def before the loop, that defines the initial value
4722          of the reduction variable.  */
4723       gimple *def_stmt;
4724       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4725                                            loop_preheader_edge (loop));
4726       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4727          and we can't use zero for induc_val, use initial_def.  Similarly
4728          for REDUC_MIN and initial_def larger than the base.  */
4729       if (TREE_CODE (initial_def) == INTEGER_CST
4730           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4731               == INTEGER_INDUC_COND_REDUCTION)
4732           && !integer_zerop (induc_val)
4733           && ((induc_code == MAX_EXPR
4734                && tree_int_cst_lt (initial_def, induc_val))
4735               || (induc_code == MIN_EXPR
4736                   && tree_int_cst_lt (induc_val, initial_def))))
4737         induc_val = initial_def;
4738       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4739       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4740                                                        &adjustment_def);
4741       vec_initial_defs.create (1);
4742       vec_initial_defs.quick_push (vec_initial_def);
4743     }
4744
4745   /* Set phi nodes arguments.  */
4746   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4747     {
4748       tree vec_init_def = vec_initial_defs[i];
4749       tree def = vect_defs[i];
4750       for (j = 0; j < ncopies; j++)
4751         {
4752           if (j != 0)
4753             {
4754               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4755               if (nested_in_vect_loop)
4756                 vec_init_def
4757                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4758                                                     vec_init_def);
4759             }
4760
4761           /* Set the loop-entry arg of the reduction-phi.  */
4762
4763           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4764               == INTEGER_INDUC_COND_REDUCTION)
4765             {
4766               /* Initialise the reduction phi to zero.  This prevents initial
4767                  values of non-zero interferring with the reduction op.  */
4768               gcc_assert (ncopies == 1);
4769               gcc_assert (i == 0);
4770
4771               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4772               tree induc_val_vec
4773                 = build_vector_from_val (vec_init_def_type, induc_val);
4774
4775               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4776                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4777             }
4778           else
4779             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4780                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4781
4782           /* Set the loop-latch arg for the reduction-phi.  */
4783           if (j > 0)
4784             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4785
4786           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4787                        UNKNOWN_LOCATION);
4788
4789           if (dump_enabled_p ())
4790             {
4791               dump_printf_loc (MSG_NOTE, vect_location,
4792                                "transform reduction: created def-use cycle: ");
4793               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4794               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4795             }
4796         }
4797     }
4798
4799   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4800      which is updated with the current index of the loop for every match of
4801      the original loop's cond_expr (VEC_STMT).  This results in a vector
4802      containing the last time the condition passed for that vector lane.
4803      The first match will be a 1 to allow 0 to be used for non-matching
4804      indexes.  If there are no matches at all then the vector will be all
4805      zeroes.  */
4806   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4807     {
4808       tree indx_before_incr, indx_after_incr;
4809       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4810
4811       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4812       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4813
4814       int scalar_precision
4815         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4816       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4817       tree cr_index_vector_type = build_vector_type
4818         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4819
4820       /* First we create a simple vector induction variable which starts
4821          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4822          vector size (STEP).  */
4823
4824       /* Create a {1,2,3,...} vector.  */
4825       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4826
4827       /* Create a vector of the step value.  */
4828       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4829       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4830
4831       /* Create an induction variable.  */
4832       gimple_stmt_iterator incr_gsi;
4833       bool insert_after;
4834       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4835       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4836                  insert_after, &indx_before_incr, &indx_after_incr);
4837
4838       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4839          filled with zeros (VEC_ZERO).  */
4840
4841       /* Create a vector of 0s.  */
4842       tree zero = build_zero_cst (cr_index_scalar_type);
4843       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4844
4845       /* Create a vector phi node.  */
4846       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4847       new_phi = create_phi_node (new_phi_tree, loop->header);
4848       set_vinfo_for_stmt (new_phi,
4849                           new_stmt_vec_info (new_phi, loop_vinfo));
4850       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4851                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4852
4853       /* Now take the condition from the loops original cond_expr
4854          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4855          every match uses values from the induction variable
4856          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4857          (NEW_PHI_TREE).
4858          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4859          the new cond_expr (INDEX_COND_EXPR).  */
4860
4861       /* Duplicate the condition from vec_stmt.  */
4862       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4863
4864       /* Create a conditional, where the condition is taken from vec_stmt
4865          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4866          else is the phi (NEW_PHI_TREE).  */
4867       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4868                                      ccompare, indx_before_incr,
4869                                      new_phi_tree);
4870       induction_index = make_ssa_name (cr_index_vector_type);
4871       gimple *index_condition = gimple_build_assign (induction_index,
4872                                                      index_cond_expr);
4873       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4874       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4875                                                         loop_vinfo);
4876       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4877       set_vinfo_for_stmt (index_condition, index_vec_info);
4878
4879       /* Update the phi with the vec cond.  */
4880       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4881                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4882     }
4883
4884   /* 2. Create epilog code.
4885         The reduction epilog code operates across the elements of the vector
4886         of partial results computed by the vectorized loop.
4887         The reduction epilog code consists of:
4888
4889         step 1: compute the scalar result in a vector (v_out2)
4890         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4891         step 3: adjust the scalar result (s_out3) if needed.
4892
4893         Step 1 can be accomplished using one the following three schemes:
4894           (scheme 1) using reduc_fn, if available.
4895           (scheme 2) using whole-vector shifts, if available.
4896           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4897                      combined.
4898
4899           The overall epilog code looks like this:
4900
4901           s_out0 = phi <s_loop>         # original EXIT_PHI
4902           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4903           v_out2 = reduce <v_out1>              # step 1
4904           s_out3 = extract_field <v_out2, 0>    # step 2
4905           s_out4 = adjust_result <s_out3>       # step 3
4906
4907           (step 3 is optional, and steps 1 and 2 may be combined).
4908           Lastly, the uses of s_out0 are replaced by s_out4.  */
4909
4910
4911   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4912          v_out1 = phi <VECT_DEF>
4913          Store them in NEW_PHIS.  */
4914
4915   exit_bb = single_exit (loop)->dest;
4916   prev_phi_info = NULL;
4917   new_phis.create (vect_defs.length ());
4918   FOR_EACH_VEC_ELT (vect_defs, i, def)
4919     {
4920       for (j = 0; j < ncopies; j++)
4921         {
4922           tree new_def = copy_ssa_name (def);
4923           phi = create_phi_node (new_def, exit_bb);
4924           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4925           if (j == 0)
4926             new_phis.quick_push (phi);
4927           else
4928             {
4929               def = vect_get_vec_def_for_stmt_copy (dt, def);
4930               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4931             }
4932
4933           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4934           prev_phi_info = vinfo_for_stmt (phi);
4935         }
4936     }
4937
4938   /* The epilogue is created for the outer-loop, i.e., for the loop being
4939      vectorized.  Create exit phis for the outer loop.  */
4940   if (double_reduc)
4941     {
4942       loop = outer_loop;
4943       exit_bb = single_exit (loop)->dest;
4944       inner_phis.create (vect_defs.length ());
4945       FOR_EACH_VEC_ELT (new_phis, i, phi)
4946         {
4947           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4948           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4949           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4950                            PHI_RESULT (phi));
4951           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4952                                                             loop_vinfo));
4953           inner_phis.quick_push (phi);
4954           new_phis[i] = outer_phi;
4955           prev_phi_info = vinfo_for_stmt (outer_phi);
4956           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4957             {
4958               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4959               new_result = copy_ssa_name (PHI_RESULT (phi));
4960               outer_phi = create_phi_node (new_result, exit_bb);
4961               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4962                                PHI_RESULT (phi));
4963               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4964                                                                 loop_vinfo));
4965               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4966               prev_phi_info = vinfo_for_stmt (outer_phi);
4967             }
4968         }
4969     }
4970
4971   exit_gsi = gsi_after_labels (exit_bb);
4972
4973   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4974          (i.e. when reduc_fn is not available) and in the final adjustment
4975          code (if needed).  Also get the original scalar reduction variable as
4976          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4977          represents a reduction pattern), the tree-code and scalar-def are
4978          taken from the original stmt that the pattern-stmt (STMT) replaces.
4979          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4980          are taken from STMT.  */
4981
4982   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4983   if (!orig_stmt)
4984     {
4985       /* Regular reduction  */
4986       orig_stmt = stmt;
4987     }
4988   else
4989     {
4990       /* Reduction pattern  */
4991       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4992       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4993       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4994     }
4995
4996   code = gimple_assign_rhs_code (orig_stmt);
4997   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4998      partial results are added and not subtracted.  */
4999   if (code == MINUS_EXPR)
5000     code = PLUS_EXPR;
5001
5002   scalar_dest = gimple_assign_lhs (orig_stmt);
5003   scalar_type = TREE_TYPE (scalar_dest);
5004   scalar_results.create (group_size);
5005   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5006   bitsize = TYPE_SIZE (scalar_type);
5007
5008   /* In case this is a reduction in an inner-loop while vectorizing an outer
5009      loop - we don't need to extract a single scalar result at the end of the
5010      inner-loop (unless it is double reduction, i.e., the use of reduction is
5011      outside the outer-loop).  The final vector of partial results will be used
5012      in the vectorized outer-loop, or reduced to a scalar result at the end of
5013      the outer-loop.  */
5014   if (nested_in_vect_loop && !double_reduc)
5015     goto vect_finalize_reduction;
5016
5017   /* SLP reduction without reduction chain, e.g.,
5018      # a1 = phi <a2, a0>
5019      # b1 = phi <b2, b0>
5020      a2 = operation (a1)
5021      b2 = operation (b1)  */
5022   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5023
5024   /* True if we should implement SLP_REDUC using native reduction operations
5025      instead of scalar operations.  */
5026   direct_slp_reduc = (reduc_fn != IFN_LAST
5027                       && slp_reduc
5028                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5029
5030   /* In case of reduction chain, e.g.,
5031      # a1 = phi <a3, a0>
5032      a2 = operation (a1)
5033      a3 = operation (a2),
5034
5035      we may end up with more than one vector result.  Here we reduce them to
5036      one vector.  */
5037   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5038     {
5039       tree first_vect = PHI_RESULT (new_phis[0]);
5040       gassign *new_vec_stmt = NULL;
5041       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5042       for (k = 1; k < new_phis.length (); k++)
5043         {
5044           gimple *next_phi = new_phis[k];
5045           tree second_vect = PHI_RESULT (next_phi);
5046           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5047           new_vec_stmt = gimple_build_assign (tem, code,
5048                                               first_vect, second_vect);
5049           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5050           first_vect = tem;
5051         }
5052
5053       new_phi_result = first_vect;
5054       if (new_vec_stmt)
5055         {
5056           new_phis.truncate (0);
5057           new_phis.safe_push (new_vec_stmt);
5058         }
5059     }
5060   /* Likewise if we couldn't use a single defuse cycle.  */
5061   else if (ncopies > 1)
5062     {
5063       gcc_assert (new_phis.length () == 1);
5064       tree first_vect = PHI_RESULT (new_phis[0]);
5065       gassign *new_vec_stmt = NULL;
5066       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5067       gimple *next_phi = new_phis[0];
5068       for (int k = 1; k < ncopies; ++k)
5069         {
5070           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5071           tree second_vect = PHI_RESULT (next_phi);
5072           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5073           new_vec_stmt = gimple_build_assign (tem, code,
5074                                               first_vect, second_vect);
5075           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5076           first_vect = tem;
5077         }
5078       new_phi_result = first_vect;
5079       new_phis.truncate (0);
5080       new_phis.safe_push (new_vec_stmt);
5081     }
5082   else
5083     new_phi_result = PHI_RESULT (new_phis[0]);
5084
5085   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5086       && reduc_fn != IFN_LAST)
5087     {
5088       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5089          various data values where the condition matched and another vector
5090          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5091          need to extract the last matching index (which will be the index with
5092          highest value) and use this to index into the data vector.
5093          For the case where there were no matches, the data vector will contain
5094          all default values and the index vector will be all zeros.  */
5095
5096       /* Get various versions of the type of the vector of indexes.  */
5097       tree index_vec_type = TREE_TYPE (induction_index);
5098       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5099       tree index_scalar_type = TREE_TYPE (index_vec_type);
5100       tree index_vec_cmp_type = build_same_sized_truth_vector_type
5101         (index_vec_type);
5102
5103       /* Get an unsigned integer version of the type of the data vector.  */
5104       int scalar_precision
5105         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5106       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5107       tree vectype_unsigned = build_vector_type
5108         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5109
5110       /* First we need to create a vector (ZERO_VEC) of zeros and another
5111          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5112          can create using a MAX reduction and then expanding.
5113          In the case where the loop never made any matches, the max index will
5114          be zero.  */
5115
5116       /* Vector of {0, 0, 0,...}.  */
5117       tree zero_vec = make_ssa_name (vectype);
5118       tree zero_vec_rhs = build_zero_cst (vectype);
5119       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5120       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5121
5122       /* Find maximum value from the vector of found indexes.  */
5123       tree max_index = make_ssa_name (index_scalar_type);
5124       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5125                                                           1, induction_index);
5126       gimple_call_set_lhs (max_index_stmt, max_index);
5127       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5128
5129       /* Vector of {max_index, max_index, max_index,...}.  */
5130       tree max_index_vec = make_ssa_name (index_vec_type);
5131       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5132                                                       max_index);
5133       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5134                                                         max_index_vec_rhs);
5135       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5136
5137       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5138          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5139          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5140          otherwise.  Only one value should match, resulting in a vector
5141          (VEC_COND) with one data value and the rest zeros.
5142          In the case where the loop never made any matches, every index will
5143          match, resulting in a vector with all data values (which will all be
5144          the default value).  */
5145
5146       /* Compare the max index vector to the vector of found indexes to find
5147          the position of the max value.  */
5148       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5149       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5150                                                       induction_index,
5151                                                       max_index_vec);
5152       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5153
5154       /* Use the compare to choose either values from the data vector or
5155          zero.  */
5156       tree vec_cond = make_ssa_name (vectype);
5157       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5158                                                    vec_compare, new_phi_result,
5159                                                    zero_vec);
5160       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5161
5162       /* Finally we need to extract the data value from the vector (VEC_COND)
5163          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5164          reduction, but because this doesn't exist, we can use a MAX reduction
5165          instead.  The data value might be signed or a float so we need to cast
5166          it first.
5167          In the case where the loop never made any matches, the data values are
5168          all identical, and so will reduce down correctly.  */
5169
5170       /* Make the matched data values unsigned.  */
5171       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5172       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5173                                        vec_cond);
5174       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5175                                                         VIEW_CONVERT_EXPR,
5176                                                         vec_cond_cast_rhs);
5177       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5178
5179       /* Reduce down to a scalar value.  */
5180       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5181       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5182                                                            1, vec_cond_cast);
5183       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5184       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5185
5186       /* Convert the reduced value back to the result type and set as the
5187          result.  */
5188       gimple_seq stmts = NULL;
5189       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5190                                data_reduc);
5191       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5192       scalar_results.safe_push (new_temp);
5193     }
5194   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5195            && reduc_fn == IFN_LAST)
5196     {
5197       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5198          idx = 0;
5199          idx_val = induction_index[0];
5200          val = data_reduc[0];
5201          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5202            if (induction_index[i] > idx_val)
5203              val = data_reduc[i], idx_val = induction_index[i];
5204          return val;  */
5205
5206       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5207       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5208       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5209       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5210       /* Enforced by vectorizable_reduction, which ensures we have target
5211          support before allowing a conditional reduction on variable-length
5212          vectors.  */
5213       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5214       tree idx_val = NULL_TREE, val = NULL_TREE;
5215       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5216         {
5217           tree old_idx_val = idx_val;
5218           tree old_val = val;
5219           idx_val = make_ssa_name (idx_eltype);
5220           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5221                                              build3 (BIT_FIELD_REF, idx_eltype,
5222                                                      induction_index,
5223                                                      bitsize_int (el_size),
5224                                                      bitsize_int (off)));
5225           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5226           val = make_ssa_name (data_eltype);
5227           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5228                                              build3 (BIT_FIELD_REF,
5229                                                      data_eltype,
5230                                                      new_phi_result,
5231                                                      bitsize_int (el_size),
5232                                                      bitsize_int (off)));
5233           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5234           if (off != 0)
5235             {
5236               tree new_idx_val = idx_val;
5237               tree new_val = val;
5238               if (off != v_size - el_size)
5239                 {
5240                   new_idx_val = make_ssa_name (idx_eltype);
5241                   epilog_stmt = gimple_build_assign (new_idx_val,
5242                                                      MAX_EXPR, idx_val,
5243                                                      old_idx_val);
5244                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5245                 }
5246               new_val = make_ssa_name (data_eltype);
5247               epilog_stmt = gimple_build_assign (new_val,
5248                                                  COND_EXPR,
5249                                                  build2 (GT_EXPR,
5250                                                          boolean_type_node,
5251                                                          idx_val,
5252                                                          old_idx_val),
5253                                                  val, old_val);
5254               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5255               idx_val = new_idx_val;
5256               val = new_val;
5257             }
5258         }
5259       /* Convert the reduced value back to the result type and set as the
5260          result.  */
5261       gimple_seq stmts = NULL;
5262       val = gimple_convert (&stmts, scalar_type, val);
5263       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5264       scalar_results.safe_push (val);
5265     }
5266
5267   /* 2.3 Create the reduction code, using one of the three schemes described
5268          above. In SLP we simply need to extract all the elements from the
5269          vector (without reducing them), so we use scalar shifts.  */
5270   else if (reduc_fn != IFN_LAST && !slp_reduc)
5271     {
5272       tree tmp;
5273       tree vec_elem_type;
5274
5275       /* Case 1:  Create:
5276          v_out2 = reduc_expr <v_out1>  */
5277
5278       if (dump_enabled_p ())
5279         dump_printf_loc (MSG_NOTE, vect_location,
5280                          "Reduce using direct vector reduction.\n");
5281
5282       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5283       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5284         {
5285           tree tmp_dest
5286             = vect_create_destination_var (scalar_dest, vec_elem_type);
5287           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5288                                                     new_phi_result);
5289           gimple_set_lhs (epilog_stmt, tmp_dest);
5290           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5291           gimple_set_lhs (epilog_stmt, new_temp);
5292           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5293
5294           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5295                                              new_temp);
5296         }
5297       else
5298         {
5299           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5300                                                     new_phi_result);
5301           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5302         }
5303
5304       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5305       gimple_set_lhs (epilog_stmt, new_temp);
5306       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5307
5308       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5309            == INTEGER_INDUC_COND_REDUCTION)
5310           && !operand_equal_p (initial_def, induc_val, 0))
5311         {
5312           /* Earlier we set the initial value to be a vector if induc_val
5313              values.  Check the result and if it is induc_val then replace
5314              with the original initial value, unless induc_val is
5315              the same as initial_def already.  */
5316           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5317                                   induc_val);
5318
5319           tmp = make_ssa_name (new_scalar_dest);
5320           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5321                                              initial_def, new_temp);
5322           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5323           new_temp = tmp;
5324         }
5325
5326       scalar_results.safe_push (new_temp);
5327     }
5328   else if (direct_slp_reduc)
5329     {
5330       /* Here we create one vector for each of the GROUP_SIZE results,
5331          with the elements for other SLP statements replaced with the
5332          neutral value.  We can then do a normal reduction on each vector.  */
5333
5334       /* Enforced by vectorizable_reduction.  */
5335       gcc_assert (new_phis.length () == 1);
5336       gcc_assert (pow2p_hwi (group_size));
5337
5338       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5339       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5340       gimple_seq seq = NULL;
5341
5342       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5343          and the same element size as VECTYPE.  */
5344       tree index = build_index_vector (vectype, 0, 1);
5345       tree index_type = TREE_TYPE (index);
5346       tree index_elt_type = TREE_TYPE (index_type);
5347       tree mask_type = build_same_sized_truth_vector_type (index_type);
5348
5349       /* Create a vector that, for each element, identifies which of
5350          the GROUP_SIZE results should use it.  */
5351       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5352       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5353                             build_vector_from_val (index_type, index_mask));
5354
5355       /* Get a neutral vector value.  This is simply a splat of the neutral
5356          scalar value if we have one, otherwise the initial scalar value
5357          is itself a neutral value.  */
5358       tree vector_identity = NULL_TREE;
5359       if (neutral_op)
5360         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5361                                                         neutral_op);
5362       for (unsigned int i = 0; i < group_size; ++i)
5363         {
5364           /* If there's no univeral neutral value, we can use the
5365              initial scalar value from the original PHI.  This is used
5366              for MIN and MAX reduction, for example.  */
5367           if (!neutral_op)
5368             {
5369               tree scalar_value
5370                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5371                                          loop_preheader_edge (loop));
5372               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5373                                                               scalar_value);
5374             }
5375
5376           /* Calculate the equivalent of:
5377
5378              sel[j] = (index[j] == i);
5379
5380              which selects the elements of NEW_PHI_RESULT that should
5381              be included in the result.  */
5382           tree compare_val = build_int_cst (index_elt_type, i);
5383           compare_val = build_vector_from_val (index_type, compare_val);
5384           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5385                                    index, compare_val);
5386
5387           /* Calculate the equivalent of:
5388
5389              vec = seq ? new_phi_result : vector_identity;
5390
5391              VEC is now suitable for a full vector reduction.  */
5392           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5393                                    sel, new_phi_result, vector_identity);
5394
5395           /* Do the reduction and convert it to the appropriate type.  */
5396           gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5397           tree scalar = make_ssa_name (TREE_TYPE (vectype));
5398           gimple_call_set_lhs (call, scalar);
5399           gimple_seq_add_stmt (&seq, call);
5400           scalar = gimple_convert (&seq, scalar_type, scalar);
5401           scalar_results.safe_push (scalar);
5402         }
5403       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5404     }
5405   else
5406     {
5407       bool reduce_with_shift;
5408       tree vec_temp;
5409
5410       /* COND reductions all do the final reduction with MAX_EXPR
5411          or MIN_EXPR.  */
5412       if (code == COND_EXPR)
5413         {
5414           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5415               == INTEGER_INDUC_COND_REDUCTION)
5416             code = induc_code;
5417           else
5418             code = MAX_EXPR;
5419         }
5420
5421       /* See if the target wants to do the final (shift) reduction
5422          in a vector mode of smaller size and first reduce upper/lower
5423          halves against each other.  */
5424       enum machine_mode mode1 = mode;
5425       tree vectype1 = vectype;
5426       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5427       unsigned sz1 = sz;
5428       if (!slp_reduc
5429           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5430         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5431
5432       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5433       reduce_with_shift = have_whole_vector_shift (mode1);
5434       if (!VECTOR_MODE_P (mode1))
5435         reduce_with_shift = false;
5436       else
5437         {
5438           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5439           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5440             reduce_with_shift = false;
5441         }
5442
5443       /* First reduce the vector to the desired vector size we should
5444          do shift reduction on by combining upper and lower halves.  */
5445       new_temp = new_phi_result;
5446       while (sz > sz1)
5447         {
5448           gcc_assert (!slp_reduc);
5449           sz /= 2;
5450           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5451
5452           /* The target has to make sure we support lowpart/highpart
5453              extraction, either via direct vector extract or through
5454              an integer mode punning.  */
5455           tree dst1, dst2;
5456           if (convert_optab_handler (vec_extract_optab,
5457                                      TYPE_MODE (TREE_TYPE (new_temp)),
5458                                      TYPE_MODE (vectype1))
5459               != CODE_FOR_nothing)
5460             {
5461               /* Extract sub-vectors directly once vec_extract becomes
5462                  a conversion optab.  */
5463               dst1 = make_ssa_name (vectype1);
5464               epilog_stmt
5465                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5466                                          build3 (BIT_FIELD_REF, vectype1,
5467                                                  new_temp, TYPE_SIZE (vectype1),
5468                                                  bitsize_int (0)));
5469               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5470               dst2 =  make_ssa_name (vectype1);
5471               epilog_stmt
5472                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5473                                          build3 (BIT_FIELD_REF, vectype1,
5474                                                  new_temp, TYPE_SIZE (vectype1),
5475                                                  bitsize_int (sz * BITS_PER_UNIT)));
5476               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5477             }
5478           else
5479             {
5480               /* Extract via punning to appropriately sized integer mode
5481                  vector.  */
5482               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5483                                                             1);
5484               tree etype = build_vector_type (eltype, 2);
5485               gcc_assert (convert_optab_handler (vec_extract_optab,
5486                                                  TYPE_MODE (etype),
5487                                                  TYPE_MODE (eltype))
5488                           != CODE_FOR_nothing);
5489               tree tem = make_ssa_name (etype);
5490               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5491                                                  build1 (VIEW_CONVERT_EXPR,
5492                                                          etype, new_temp));
5493               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5494               new_temp = tem;
5495               tem = make_ssa_name (eltype);
5496               epilog_stmt
5497                   = gimple_build_assign (tem, BIT_FIELD_REF,
5498                                          build3 (BIT_FIELD_REF, eltype,
5499                                                  new_temp, TYPE_SIZE (eltype),
5500                                                  bitsize_int (0)));
5501               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5502               dst1 = make_ssa_name (vectype1);
5503               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5504                                                  build1 (VIEW_CONVERT_EXPR,
5505                                                          vectype1, tem));
5506               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5507               tem = make_ssa_name (eltype);
5508               epilog_stmt
5509                   = gimple_build_assign (tem, BIT_FIELD_REF,
5510                                          build3 (BIT_FIELD_REF, eltype,
5511                                                  new_temp, TYPE_SIZE (eltype),
5512                                                  bitsize_int (sz * BITS_PER_UNIT)));
5513               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5514               dst2 =  make_ssa_name (vectype1);
5515               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5516                                                  build1 (VIEW_CONVERT_EXPR,
5517                                                          vectype1, tem));
5518               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5519             }
5520
5521           new_temp = make_ssa_name (vectype1);
5522           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5523           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5524         }
5525
5526       if (reduce_with_shift && !slp_reduc)
5527         {
5528           int element_bitsize = tree_to_uhwi (bitsize);
5529           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5530              for variable-length vectors and also requires direct target support
5531              for loop reductions.  */
5532           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5533           int nelements = vec_size_in_bits / element_bitsize;
5534           vec_perm_builder sel;
5535           vec_perm_indices indices;
5536
5537           int elt_offset;
5538
5539           tree zero_vec = build_zero_cst (vectype1);
5540           /* Case 2: Create:
5541              for (offset = nelements/2; offset >= 1; offset/=2)
5542                 {
5543                   Create:  va' = vec_shift <va, offset>
5544                   Create:  va = vop <va, va'>
5545                 }  */
5546
5547           tree rhs;
5548
5549           if (dump_enabled_p ())
5550             dump_printf_loc (MSG_NOTE, vect_location,
5551                              "Reduce using vector shifts\n");
5552
5553           mode1 = TYPE_MODE (vectype1);
5554           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5555           for (elt_offset = nelements / 2;
5556                elt_offset >= 1;
5557                elt_offset /= 2)
5558             {
5559               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5560               indices.new_vector (sel, 2, nelements);
5561               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5562               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5563                                                  new_temp, zero_vec, mask);
5564               new_name = make_ssa_name (vec_dest, epilog_stmt);
5565               gimple_assign_set_lhs (epilog_stmt, new_name);
5566               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5567
5568               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5569                                                  new_temp);
5570               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5571               gimple_assign_set_lhs (epilog_stmt, new_temp);
5572               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5573             }
5574
5575           /* 2.4  Extract the final scalar result.  Create:
5576              s_out3 = extract_field <v_out2, bitpos>  */
5577
5578           if (dump_enabled_p ())
5579             dump_printf_loc (MSG_NOTE, vect_location,
5580                              "extract scalar result\n");
5581
5582           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5583                         bitsize, bitsize_zero_node);
5584           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5585           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5586           gimple_assign_set_lhs (epilog_stmt, new_temp);
5587           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5588           scalar_results.safe_push (new_temp);
5589         }
5590       else
5591         {
5592           /* Case 3: Create:
5593              s = extract_field <v_out2, 0>
5594              for (offset = element_size;
5595                   offset < vector_size;
5596                   offset += element_size;)
5597                {
5598                  Create:  s' = extract_field <v_out2, offset>
5599                  Create:  s = op <s, s'>  // For non SLP cases
5600                }  */
5601
5602           if (dump_enabled_p ())
5603             dump_printf_loc (MSG_NOTE, vect_location,
5604                              "Reduce using scalar code.\n");
5605
5606           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5607           int element_bitsize = tree_to_uhwi (bitsize);
5608           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5609             {
5610               int bit_offset;
5611               if (gimple_code (new_phi) == GIMPLE_PHI)
5612                 vec_temp = PHI_RESULT (new_phi);
5613               else
5614                 vec_temp = gimple_assign_lhs (new_phi);
5615               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5616                                  bitsize_zero_node);
5617               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5618               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5619               gimple_assign_set_lhs (epilog_stmt, new_temp);
5620               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5621
5622               /* In SLP we don't need to apply reduction operation, so we just
5623                  collect s' values in SCALAR_RESULTS.  */
5624               if (slp_reduc)
5625                 scalar_results.safe_push (new_temp);
5626
5627               for (bit_offset = element_bitsize;
5628                    bit_offset < vec_size_in_bits;
5629                    bit_offset += element_bitsize)
5630                 {
5631                   tree bitpos = bitsize_int (bit_offset);
5632                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5633                                      bitsize, bitpos);
5634
5635                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5636                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5637                   gimple_assign_set_lhs (epilog_stmt, new_name);
5638                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5639
5640                   if (slp_reduc)
5641                     {
5642                       /* In SLP we don't need to apply reduction operation, so
5643                          we just collect s' values in SCALAR_RESULTS.  */
5644                       new_temp = new_name;
5645                       scalar_results.safe_push (new_name);
5646                     }
5647                   else
5648                     {
5649                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5650                                                          new_name, new_temp);
5651                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5652                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5653                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5654                     }
5655                 }
5656             }
5657
5658           /* The only case where we need to reduce scalar results in SLP, is
5659              unrolling.  If the size of SCALAR_RESULTS is greater than
5660              GROUP_SIZE, we reduce them combining elements modulo
5661              GROUP_SIZE.  */
5662           if (slp_reduc)
5663             {
5664               tree res, first_res, new_res;
5665               gimple *new_stmt;
5666
5667               /* Reduce multiple scalar results in case of SLP unrolling.  */
5668               for (j = group_size; scalar_results.iterate (j, &res);
5669                    j++)
5670                 {
5671                   first_res = scalar_results[j % group_size];
5672                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5673                                                   first_res, res);
5674                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5675                   gimple_assign_set_lhs (new_stmt, new_res);
5676                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5677                   scalar_results[j % group_size] = new_res;
5678                 }
5679             }
5680           else
5681             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5682             scalar_results.safe_push (new_temp);
5683         }
5684
5685       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5686            == INTEGER_INDUC_COND_REDUCTION)
5687           && !operand_equal_p (initial_def, induc_val, 0))
5688         {
5689           /* Earlier we set the initial value to be a vector if induc_val
5690              values.  Check the result and if it is induc_val then replace
5691              with the original initial value, unless induc_val is
5692              the same as initial_def already.  */
5693           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5694                                   induc_val);
5695
5696           tree tmp = make_ssa_name (new_scalar_dest);
5697           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5698                                              initial_def, new_temp);
5699           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5700           scalar_results[0] = tmp;
5701         }
5702     }
5703
5704 vect_finalize_reduction:
5705
5706   if (double_reduc)
5707     loop = loop->inner;
5708
5709   /* 2.5 Adjust the final result by the initial value of the reduction
5710          variable. (When such adjustment is not needed, then
5711          'adjustment_def' is zero).  For example, if code is PLUS we create:
5712          new_temp = loop_exit_def + adjustment_def  */
5713
5714   if (adjustment_def)
5715     {
5716       gcc_assert (!slp_reduc);
5717       if (nested_in_vect_loop)
5718         {
5719           new_phi = new_phis[0];
5720           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5721           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5722           new_dest = vect_create_destination_var (scalar_dest, vectype);
5723         }
5724       else
5725         {
5726           new_temp = scalar_results[0];
5727           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5728           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5729           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5730         }
5731
5732       epilog_stmt = gimple_build_assign (new_dest, expr);
5733       new_temp = make_ssa_name (new_dest, epilog_stmt);
5734       gimple_assign_set_lhs (epilog_stmt, new_temp);
5735       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5736       if (nested_in_vect_loop)
5737         {
5738           set_vinfo_for_stmt (epilog_stmt,
5739                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5740           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5741                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5742
5743           if (!double_reduc)
5744             scalar_results.quick_push (new_temp);
5745           else
5746             scalar_results[0] = new_temp;
5747         }
5748       else
5749         scalar_results[0] = new_temp;
5750
5751       new_phis[0] = epilog_stmt;
5752     }
5753
5754   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5755           phis with new adjusted scalar results, i.e., replace use <s_out0>
5756           with use <s_out4>.
5757
5758      Transform:
5759         loop_exit:
5760           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5761           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5762           v_out2 = reduce <v_out1>
5763           s_out3 = extract_field <v_out2, 0>
5764           s_out4 = adjust_result <s_out3>
5765           use <s_out0>
5766           use <s_out0>
5767
5768      into:
5769
5770         loop_exit:
5771           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5772           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5773           v_out2 = reduce <v_out1>
5774           s_out3 = extract_field <v_out2, 0>
5775           s_out4 = adjust_result <s_out3>
5776           use <s_out4>
5777           use <s_out4> */
5778
5779
5780   /* In SLP reduction chain we reduce vector results into one vector if
5781      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5782      the last stmt in the reduction chain, since we are looking for the loop
5783      exit phi node.  */
5784   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5785     {
5786       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5787       /* Handle reduction patterns.  */
5788       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5789         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5790
5791       scalar_dest = gimple_assign_lhs (dest_stmt);
5792       group_size = 1;
5793     }
5794
5795   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5796      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5797      need to match SCALAR_RESULTS with corresponding statements.  The first
5798      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5799      the first vector stmt, etc.
5800      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5801   if (group_size > new_phis.length ())
5802     {
5803       ratio = group_size / new_phis.length ();
5804       gcc_assert (!(group_size % new_phis.length ()));
5805     }
5806   else
5807     ratio = 1;
5808
5809   for (k = 0; k < group_size; k++)
5810     {
5811       if (k % ratio == 0)
5812         {
5813           epilog_stmt = new_phis[k / ratio];
5814           reduction_phi = reduction_phis[k / ratio];
5815           if (double_reduc)
5816             inner_phi = inner_phis[k / ratio];
5817         }
5818
5819       if (slp_reduc)
5820         {
5821           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5822
5823           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5824           /* SLP statements can't participate in patterns.  */
5825           gcc_assert (!orig_stmt);
5826           scalar_dest = gimple_assign_lhs (current_stmt);
5827         }
5828
5829       phis.create (3);
5830       /* Find the loop-closed-use at the loop exit of the original scalar
5831          result.  (The reduction result is expected to have two immediate uses -
5832          one at the latch block, and one at the loop exit).  */
5833       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5834         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5835             && !is_gimple_debug (USE_STMT (use_p)))
5836           phis.safe_push (USE_STMT (use_p));
5837
5838       /* While we expect to have found an exit_phi because of loop-closed-ssa
5839          form we can end up without one if the scalar cycle is dead.  */
5840
5841       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5842         {
5843           if (outer_loop)
5844             {
5845               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5846               gphi *vect_phi;
5847
5848               /* FORNOW. Currently not supporting the case that an inner-loop
5849                  reduction is not used in the outer-loop (but only outside the
5850                  outer-loop), unless it is double reduction.  */
5851               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5852                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5853                           || double_reduc);
5854
5855               if (double_reduc)
5856                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5857               else
5858                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5859               if (!double_reduc
5860                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5861                       != vect_double_reduction_def)
5862                 continue;
5863
5864               /* Handle double reduction:
5865
5866                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5867                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5868                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5869                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5870
5871                  At that point the regular reduction (stmt2 and stmt3) is
5872                  already vectorized, as well as the exit phi node, stmt4.
5873                  Here we vectorize the phi node of double reduction, stmt1, and
5874                  update all relevant statements.  */
5875
5876               /* Go through all the uses of s2 to find double reduction phi
5877                  node, i.e., stmt1 above.  */
5878               orig_name = PHI_RESULT (exit_phi);
5879               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5880                 {
5881                   stmt_vec_info use_stmt_vinfo;
5882                   stmt_vec_info new_phi_vinfo;
5883                   tree vect_phi_init, preheader_arg, vect_phi_res;
5884                   basic_block bb = gimple_bb (use_stmt);
5885                   gimple *use;
5886
5887                   /* Check that USE_STMT is really double reduction phi
5888                      node.  */
5889                   if (gimple_code (use_stmt) != GIMPLE_PHI
5890                       || gimple_phi_num_args (use_stmt) != 2
5891                       || bb->loop_father != outer_loop)
5892                     continue;
5893                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5894                   if (!use_stmt_vinfo
5895                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5896                           != vect_double_reduction_def)
5897                     continue;
5898
5899                   /* Create vector phi node for double reduction:
5900                      vs1 = phi <vs0, vs2>
5901                      vs1 was created previously in this function by a call to
5902                        vect_get_vec_def_for_operand and is stored in
5903                        vec_initial_def;
5904                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5905                      vs0 is created here.  */
5906
5907                   /* Create vector phi node.  */
5908                   vect_phi = create_phi_node (vec_initial_def, bb);
5909                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5910                                     loop_vec_info_for_loop (outer_loop));
5911                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5912
5913                   /* Create vs0 - initial def of the double reduction phi.  */
5914                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5915                                              loop_preheader_edge (outer_loop));
5916                   vect_phi_init = get_initial_def_for_reduction
5917                     (stmt, preheader_arg, NULL);
5918
5919                   /* Update phi node arguments with vs0 and vs2.  */
5920                   add_phi_arg (vect_phi, vect_phi_init,
5921                                loop_preheader_edge (outer_loop),
5922                                UNKNOWN_LOCATION);
5923                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5924                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5925                   if (dump_enabled_p ())
5926                     {
5927                       dump_printf_loc (MSG_NOTE, vect_location,
5928                                        "created double reduction phi node: ");
5929                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5930                     }
5931
5932                   vect_phi_res = PHI_RESULT (vect_phi);
5933
5934                   /* Replace the use, i.e., set the correct vs1 in the regular
5935                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5936                      loop is redundant.  */
5937                   use = reduction_phi;
5938                   for (j = 0; j < ncopies; j++)
5939                     {
5940                       edge pr_edge = loop_preheader_edge (loop);
5941                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5942                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5943                     }
5944                 }
5945             }
5946         }
5947
5948       phis.release ();
5949       if (nested_in_vect_loop)
5950         {
5951           if (double_reduc)
5952             loop = outer_loop;
5953           else
5954             continue;
5955         }
5956
5957       phis.create (3);
5958       /* Find the loop-closed-use at the loop exit of the original scalar
5959          result.  (The reduction result is expected to have two immediate uses,
5960          one at the latch block, and one at the loop exit).  For double
5961          reductions we are looking for exit phis of the outer loop.  */
5962       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5963         {
5964           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5965             {
5966               if (!is_gimple_debug (USE_STMT (use_p)))
5967                 phis.safe_push (USE_STMT (use_p));
5968             }
5969           else
5970             {
5971               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5972                 {
5973                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5974
5975                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5976                     {
5977                       if (!flow_bb_inside_loop_p (loop,
5978                                              gimple_bb (USE_STMT (phi_use_p)))
5979                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5980                         phis.safe_push (USE_STMT (phi_use_p));
5981                     }
5982                 }
5983             }
5984         }
5985
5986       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5987         {
5988           /* Replace the uses:  */
5989           orig_name = PHI_RESULT (exit_phi);
5990           scalar_result = scalar_results[k];
5991           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5992             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5993               SET_USE (use_p, scalar_result);
5994         }
5995
5996       phis.release ();
5997     }
5998 }
5999
6000
6001 /* Function is_nonwrapping_integer_induction.
6002
6003    Check if STMT (which is part of loop LOOP) both increments and
6004    does not cause overflow.  */
6005
6006 static bool
6007 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6008 {
6009   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6010   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6011   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6012   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6013   widest_int ni, max_loop_value, lhs_max;
6014   bool overflow = false;
6015
6016   /* Make sure the loop is integer based.  */
6017   if (TREE_CODE (base) != INTEGER_CST
6018       || TREE_CODE (step) != INTEGER_CST)
6019     return false;
6020
6021   /* Check that the max size of the loop will not wrap.  */
6022
6023   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6024     return true;
6025
6026   if (! max_stmt_executions (loop, &ni))
6027     return false;
6028
6029   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6030                             &overflow);
6031   if (overflow)
6032     return false;
6033
6034   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6035                             TYPE_SIGN (lhs_type), &overflow);
6036   if (overflow)
6037     return false;
6038
6039   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6040           <= TYPE_PRECISION (lhs_type));
6041 }
6042
6043 /* Function vectorizable_reduction.
6044
6045    Check if STMT performs a reduction operation that can be vectorized.
6046    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6047    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6048    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6049
6050    This function also handles reduction idioms (patterns) that have been
6051    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6052    of this form:
6053      X = pattern_expr (arg0, arg1, ..., X)
6054    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
6055    sequence that had been detected and replaced by the pattern-stmt (STMT).
6056
6057    This function also handles reduction of condition expressions, for example:
6058      for (int i = 0; i < N; i++)
6059        if (a[i] < value)
6060          last = a[i];
6061    This is handled by vectorising the loop and creating an additional vector
6062    containing the loop indexes for which "a[i] < value" was true.  In the
6063    function epilogue this is reduced to a single max value and then used to
6064    index into the vector of results.
6065
6066    In some cases of reduction patterns, the type of the reduction variable X is
6067    different than the type of the other arguments of STMT.
6068    In such cases, the vectype that is used when transforming STMT into a vector
6069    stmt is different than the vectype that is used to determine the
6070    vectorization factor, because it consists of a different number of elements
6071    than the actual number of elements that are being operated upon in parallel.
6072
6073    For example, consider an accumulation of shorts into an int accumulator.
6074    On some targets it's possible to vectorize this pattern operating on 8
6075    shorts at a time (hence, the vectype for purposes of determining the
6076    vectorization factor should be V8HI); on the other hand, the vectype that
6077    is used to create the vector form is actually V4SI (the type of the result).
6078
6079    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6080    indicates what is the actual level of parallelism (V8HI in the example), so
6081    that the right vectorization factor would be derived.  This vectype
6082    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6083    be used to create the vectorized stmt.  The right vectype for the vectorized
6084    stmt is obtained from the type of the result X:
6085         get_vectype_for_scalar_type (TREE_TYPE (X))
6086
6087    This means that, contrary to "regular" reductions (or "regular" stmts in
6088    general), the following equation:
6089       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6090    does *NOT* necessarily hold for reduction patterns.  */
6091
6092 bool
6093 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6094                         gimple **vec_stmt, slp_tree slp_node,
6095                         slp_instance slp_node_instance)
6096 {
6097   tree vec_dest;
6098   tree scalar_dest;
6099   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6100   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6101   tree vectype_in = NULL_TREE;
6102   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6103   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6104   enum tree_code code, orig_code;
6105   internal_fn reduc_fn;
6106   machine_mode vec_mode;
6107   int op_type;
6108   optab optab;
6109   tree new_temp = NULL_TREE;
6110   gimple *def_stmt;
6111   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6112   gimple *cond_reduc_def_stmt = NULL;
6113   enum tree_code cond_reduc_op_code = ERROR_MARK;
6114   tree scalar_type;
6115   bool is_simple_use;
6116   gimple *orig_stmt;
6117   stmt_vec_info orig_stmt_info = NULL;
6118   int i;
6119   int ncopies;
6120   int epilog_copies;
6121   stmt_vec_info prev_stmt_info, prev_phi_info;
6122   bool single_defuse_cycle = false;
6123   gimple *new_stmt = NULL;
6124   int j;
6125   tree ops[3];
6126   enum vect_def_type dts[3];
6127   bool nested_cycle = false, found_nested_cycle_def = false;
6128   bool double_reduc = false;
6129   basic_block def_bb;
6130   struct loop * def_stmt_loop, *outer_loop = NULL;
6131   tree def_arg;
6132   gimple *def_arg_stmt;
6133   auto_vec<tree> vec_oprnds0;
6134   auto_vec<tree> vec_oprnds1;
6135   auto_vec<tree> vec_oprnds2;
6136   auto_vec<tree> vect_defs;
6137   auto_vec<gimple *> phis;
6138   int vec_num;
6139   tree def0, tem;
6140   bool first_p = true;
6141   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6142   tree cond_reduc_val = NULL_TREE;
6143
6144   /* Make sure it was already recognized as a reduction computation.  */
6145   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6146       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6147     return false;
6148
6149   if (nested_in_vect_loop_p (loop, stmt))
6150     {
6151       outer_loop = loop;
6152       loop = loop->inner;
6153       nested_cycle = true;
6154     }
6155
6156   /* In case of reduction chain we switch to the first stmt in the chain, but
6157      we don't update STMT_INFO, since only the last stmt is marked as reduction
6158      and has reduction properties.  */
6159   if (GROUP_FIRST_ELEMENT (stmt_info)
6160       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6161     {
6162       stmt = GROUP_FIRST_ELEMENT (stmt_info);
6163       first_p = false;
6164     }
6165
6166   if (gimple_code (stmt) == GIMPLE_PHI)
6167     {
6168       /* Analysis is fully done on the reduction stmt invocation.  */
6169       if (! vec_stmt)
6170         {
6171           if (slp_node)
6172             slp_node_instance->reduc_phis = slp_node;
6173
6174           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6175           return true;
6176         }
6177
6178       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6179       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6180         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6181
6182       gcc_assert (is_gimple_assign (reduc_stmt));
6183       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6184         {
6185           tree op = gimple_op (reduc_stmt, k);
6186           if (op == gimple_phi_result (stmt))
6187             continue;
6188           if (k == 1
6189               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6190             continue;
6191           if (!vectype_in
6192               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6193                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6194             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6195           break;
6196         }
6197       gcc_assert (vectype_in);
6198
6199       if (slp_node)
6200         ncopies = 1;
6201       else
6202         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6203
6204       use_operand_p use_p;
6205       gimple *use_stmt;
6206       if (ncopies > 1
6207           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6208               <= vect_used_only_live)
6209           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6210           && (use_stmt == reduc_stmt
6211               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6212                   == reduc_stmt)))
6213         single_defuse_cycle = true;
6214
6215       /* Create the destination vector  */
6216       scalar_dest = gimple_assign_lhs (reduc_stmt);
6217       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6218
6219       if (slp_node)
6220         /* The size vect_schedule_slp_instance computes is off for us.  */
6221         vec_num = vect_get_num_vectors
6222           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6223            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6224            vectype_in);
6225       else
6226         vec_num = 1;
6227
6228       /* Generate the reduction PHIs upfront.  */
6229       prev_phi_info = NULL;
6230       for (j = 0; j < ncopies; j++)
6231         {
6232           if (j == 0 || !single_defuse_cycle)
6233             {
6234               for (i = 0; i < vec_num; i++)
6235                 {
6236                   /* Create the reduction-phi that defines the reduction
6237                      operand.  */
6238                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6239                   set_vinfo_for_stmt (new_phi,
6240                                       new_stmt_vec_info (new_phi, loop_vinfo));
6241
6242                   if (slp_node)
6243                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6244                   else
6245                     {
6246                       if (j == 0)
6247                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6248                       else
6249                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6250                       prev_phi_info = vinfo_for_stmt (new_phi);
6251                     }
6252                 }
6253             }
6254         }
6255
6256       return true;
6257     }
6258
6259   /* 1. Is vectorizable reduction?  */
6260   /* Not supportable if the reduction variable is used in the loop, unless
6261      it's a reduction chain.  */
6262   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6263       && !GROUP_FIRST_ELEMENT (stmt_info))
6264     return false;
6265
6266   /* Reductions that are not used even in an enclosing outer-loop,
6267      are expected to be "live" (used out of the loop).  */
6268   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6269       && !STMT_VINFO_LIVE_P (stmt_info))
6270     return false;
6271
6272   /* 2. Has this been recognized as a reduction pattern?
6273
6274      Check if STMT represents a pattern that has been recognized
6275      in earlier analysis stages.  For stmts that represent a pattern,
6276      the STMT_VINFO_RELATED_STMT field records the last stmt in
6277      the original sequence that constitutes the pattern.  */
6278
6279   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6280   if (orig_stmt)
6281     {
6282       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6283       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6284       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6285     }
6286
6287   /* 3. Check the operands of the operation.  The first operands are defined
6288         inside the loop body. The last operand is the reduction variable,
6289         which is defined by the loop-header-phi.  */
6290
6291   gcc_assert (is_gimple_assign (stmt));
6292
6293   /* Flatten RHS.  */
6294   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6295     {
6296     case GIMPLE_BINARY_RHS:
6297       code = gimple_assign_rhs_code (stmt);
6298       op_type = TREE_CODE_LENGTH (code);
6299       gcc_assert (op_type == binary_op);
6300       ops[0] = gimple_assign_rhs1 (stmt);
6301       ops[1] = gimple_assign_rhs2 (stmt);
6302       break;
6303
6304     case GIMPLE_TERNARY_RHS:
6305       code = gimple_assign_rhs_code (stmt);
6306       op_type = TREE_CODE_LENGTH (code);
6307       gcc_assert (op_type == ternary_op);
6308       ops[0] = gimple_assign_rhs1 (stmt);
6309       ops[1] = gimple_assign_rhs2 (stmt);
6310       ops[2] = gimple_assign_rhs3 (stmt);
6311       break;
6312
6313     case GIMPLE_UNARY_RHS:
6314       return false;
6315
6316     default:
6317       gcc_unreachable ();
6318     }
6319
6320   if (code == COND_EXPR && slp_node)
6321     return false;
6322
6323   scalar_dest = gimple_assign_lhs (stmt);
6324   scalar_type = TREE_TYPE (scalar_dest);
6325   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6326       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6327     return false;
6328
6329   /* Do not try to vectorize bit-precision reductions.  */
6330   if (!type_has_mode_precision_p (scalar_type))
6331     return false;
6332
6333   /* All uses but the last are expected to be defined in the loop.
6334      The last use is the reduction variable.  In case of nested cycle this
6335      assumption is not true: we use reduc_index to record the index of the
6336      reduction variable.  */
6337   gimple *reduc_def_stmt = NULL;
6338   int reduc_index = -1;
6339   for (i = 0; i < op_type; i++)
6340     {
6341       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6342       if (i == 0 && code == COND_EXPR)
6343         continue;
6344
6345       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6346                                           &def_stmt, &dts[i], &tem);
6347       dt = dts[i];
6348       gcc_assert (is_simple_use);
6349       if (dt == vect_reduction_def)
6350         {
6351           reduc_def_stmt = def_stmt;
6352           reduc_index = i;
6353           continue;
6354         }
6355       else if (tem)
6356         {
6357           /* To properly compute ncopies we are interested in the widest
6358              input type in case we're looking at a widening accumulation.  */
6359           if (!vectype_in
6360               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6361                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6362             vectype_in = tem;
6363         }
6364
6365       if (dt != vect_internal_def
6366           && dt != vect_external_def
6367           && dt != vect_constant_def
6368           && dt != vect_induction_def
6369           && !(dt == vect_nested_cycle && nested_cycle))
6370         return false;
6371
6372       if (dt == vect_nested_cycle)
6373         {
6374           found_nested_cycle_def = true;
6375           reduc_def_stmt = def_stmt;
6376           reduc_index = i;
6377         }
6378
6379       if (i == 1 && code == COND_EXPR)
6380         {
6381           /* Record how value of COND_EXPR is defined.  */
6382           if (dt == vect_constant_def)
6383             {
6384               cond_reduc_dt = dt;
6385               cond_reduc_val = ops[i];
6386             }
6387           if (dt == vect_induction_def
6388               && def_stmt != NULL
6389               && is_nonwrapping_integer_induction (def_stmt, loop))
6390             {
6391               cond_reduc_dt = dt;
6392               cond_reduc_def_stmt = def_stmt;
6393             }
6394         }
6395     }
6396
6397   if (!vectype_in)
6398     vectype_in = vectype_out;
6399
6400   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6401      directy used in stmt.  */
6402   if (reduc_index == -1)
6403     {
6404       if (orig_stmt)
6405         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6406       else
6407         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6408     }
6409
6410   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6411     return false;
6412
6413   if (!(reduc_index == -1
6414         || dts[reduc_index] == vect_reduction_def
6415         || dts[reduc_index] == vect_nested_cycle
6416         || ((dts[reduc_index] == vect_internal_def
6417              || dts[reduc_index] == vect_external_def
6418              || dts[reduc_index] == vect_constant_def
6419              || dts[reduc_index] == vect_induction_def)
6420             && nested_cycle && found_nested_cycle_def)))
6421     {
6422       /* For pattern recognized stmts, orig_stmt might be a reduction,
6423          but some helper statements for the pattern might not, or
6424          might be COND_EXPRs with reduction uses in the condition.  */
6425       gcc_assert (orig_stmt);
6426       return false;
6427     }
6428
6429   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6430   enum vect_reduction_type v_reduc_type
6431     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6432   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6433
6434   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6435   /* If we have a condition reduction, see if we can simplify it further.  */
6436   if (v_reduc_type == COND_REDUCTION)
6437     {
6438       if (cond_reduc_dt == vect_induction_def)
6439         {
6440           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6441           tree base
6442             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6443           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6444
6445           gcc_assert (TREE_CODE (base) == INTEGER_CST
6446                       && TREE_CODE (step) == INTEGER_CST);
6447           cond_reduc_val = NULL_TREE;
6448           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6449              above base; punt if base is the minimum value of the type for
6450              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6451           if (tree_int_cst_sgn (step) == -1)
6452             {
6453               cond_reduc_op_code = MIN_EXPR;
6454               if (tree_int_cst_sgn (base) == -1)
6455                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6456               else if (tree_int_cst_lt (base,
6457                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6458                 cond_reduc_val
6459                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6460             }
6461           else
6462             {
6463               cond_reduc_op_code = MAX_EXPR;
6464               if (tree_int_cst_sgn (base) == 1)
6465                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6466               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6467                                         base))
6468                 cond_reduc_val
6469                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6470             }
6471           if (cond_reduc_val)
6472             {
6473               if (dump_enabled_p ())
6474                 dump_printf_loc (MSG_NOTE, vect_location,
6475                                  "condition expression based on "
6476                                  "integer induction.\n");
6477               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6478                 = INTEGER_INDUC_COND_REDUCTION;
6479             }
6480         }
6481
6482       /* Loop peeling modifies initial value of reduction PHI, which
6483          makes the reduction stmt to be transformed different to the
6484          original stmt analyzed.  We need to record reduction code for
6485          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6486          it can be used directly at transform stage.  */
6487       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6488           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6489         {
6490           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6491           gcc_assert (cond_reduc_dt == vect_constant_def);
6492           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6493         }
6494       else if (cond_reduc_dt == vect_constant_def)
6495         {
6496           enum vect_def_type cond_initial_dt;
6497           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6498           tree cond_initial_val
6499             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6500
6501           gcc_assert (cond_reduc_val != NULL_TREE);
6502           vect_is_simple_use (cond_initial_val, loop_vinfo,
6503                               &def_stmt, &cond_initial_dt);
6504           if (cond_initial_dt == vect_constant_def
6505               && types_compatible_p (TREE_TYPE (cond_initial_val),
6506                                      TREE_TYPE (cond_reduc_val)))
6507             {
6508               tree e = fold_binary (LE_EXPR, boolean_type_node,
6509                                     cond_initial_val, cond_reduc_val);
6510               if (e && (integer_onep (e) || integer_zerop (e)))
6511                 {
6512                   if (dump_enabled_p ())
6513                     dump_printf_loc (MSG_NOTE, vect_location,
6514                                      "condition expression based on "
6515                                      "compile time constant.\n");
6516                   /* Record reduction code at analysis stage.  */
6517                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6518                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6519                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6520                     = CONST_COND_REDUCTION;
6521                 }
6522             }
6523         }
6524     }
6525
6526   if (orig_stmt)
6527     gcc_assert (tmp == orig_stmt
6528                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6529   else
6530     /* We changed STMT to be the first stmt in reduction chain, hence we
6531        check that in this case the first element in the chain is STMT.  */
6532     gcc_assert (stmt == tmp
6533                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6534
6535   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6536     return false;
6537
6538   if (slp_node)
6539     ncopies = 1;
6540   else
6541     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6542
6543   gcc_assert (ncopies >= 1);
6544
6545   vec_mode = TYPE_MODE (vectype_in);
6546   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6547
6548   if (code == COND_EXPR)
6549     {
6550       /* Only call during the analysis stage, otherwise we'll lose
6551          STMT_VINFO_TYPE.  */
6552       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6553                                                 ops[reduc_index], 0, NULL))
6554         {
6555           if (dump_enabled_p ())
6556             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6557                              "unsupported condition in reduction\n");
6558           return false;
6559         }
6560     }
6561   else
6562     {
6563       /* 4. Supportable by target?  */
6564
6565       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6566           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6567         {
6568           /* Shifts and rotates are only supported by vectorizable_shifts,
6569              not vectorizable_reduction.  */
6570           if (dump_enabled_p ())
6571             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6572                              "unsupported shift or rotation.\n");
6573           return false;
6574         }
6575
6576       /* 4.1. check support for the operation in the loop  */
6577       optab = optab_for_tree_code (code, vectype_in, optab_default);
6578       if (!optab)
6579         {
6580           if (dump_enabled_p ())
6581             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6582                              "no optab.\n");
6583
6584           return false;
6585         }
6586
6587       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6588         {
6589           if (dump_enabled_p ())
6590             dump_printf (MSG_NOTE, "op not supported by target.\n");
6591
6592           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6593               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6594             return false;
6595
6596           if (dump_enabled_p ())
6597             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6598         }
6599
6600       /* Worthwhile without SIMD support?  */
6601       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6602           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6603         {
6604           if (dump_enabled_p ())
6605             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6606                              "not worthwhile without SIMD support.\n");
6607
6608           return false;
6609         }
6610     }
6611
6612   /* 4.2. Check support for the epilog operation.
6613
6614           If STMT represents a reduction pattern, then the type of the
6615           reduction variable may be different than the type of the rest
6616           of the arguments.  For example, consider the case of accumulation
6617           of shorts into an int accumulator; The original code:
6618                         S1: int_a = (int) short_a;
6619           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6620
6621           was replaced with:
6622                         STMT: int_acc = widen_sum <short_a, int_acc>
6623
6624           This means that:
6625           1. The tree-code that is used to create the vector operation in the
6626              epilog code (that reduces the partial results) is not the
6627              tree-code of STMT, but is rather the tree-code of the original
6628              stmt from the pattern that STMT is replacing.  I.e, in the example
6629              above we want to use 'widen_sum' in the loop, but 'plus' in the
6630              epilog.
6631           2. The type (mode) we use to check available target support
6632              for the vector operation to be created in the *epilog*, is
6633              determined by the type of the reduction variable (in the example
6634              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6635              However the type (mode) we use to check available target support
6636              for the vector operation to be created *inside the loop*, is
6637              determined by the type of the other arguments to STMT (in the
6638              example we'd check this: optab_handler (widen_sum_optab,
6639              vect_short_mode)).
6640
6641           This is contrary to "regular" reductions, in which the types of all
6642           the arguments are the same as the type of the reduction variable.
6643           For "regular" reductions we can therefore use the same vector type
6644           (and also the same tree-code) when generating the epilog code and
6645           when generating the code inside the loop.  */
6646
6647   if (orig_stmt)
6648     {
6649       /* This is a reduction pattern: get the vectype from the type of the
6650          reduction variable, and get the tree-code from orig_stmt.  */
6651       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6652                   == TREE_CODE_REDUCTION);
6653       orig_code = gimple_assign_rhs_code (orig_stmt);
6654       gcc_assert (vectype_out);
6655       vec_mode = TYPE_MODE (vectype_out);
6656     }
6657   else
6658     {
6659       /* Regular reduction: use the same vectype and tree-code as used for
6660          the vector code inside the loop can be used for the epilog code. */
6661       orig_code = code;
6662
6663       if (code == MINUS_EXPR)
6664         orig_code = PLUS_EXPR;
6665
6666       /* For simple condition reductions, replace with the actual expression
6667          we want to base our reduction around.  */
6668       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6669         {
6670           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6671           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6672         }
6673       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6674                == INTEGER_INDUC_COND_REDUCTION)
6675         orig_code = cond_reduc_op_code;
6676     }
6677
6678   if (nested_cycle)
6679     {
6680       def_bb = gimple_bb (reduc_def_stmt);
6681       def_stmt_loop = def_bb->loop_father;
6682       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6683                                        loop_preheader_edge (def_stmt_loop));
6684       if (TREE_CODE (def_arg) == SSA_NAME
6685           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6686           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6687           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6688           && vinfo_for_stmt (def_arg_stmt)
6689           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6690               == vect_double_reduction_def)
6691         double_reduc = true;
6692     }
6693
6694   reduc_fn = IFN_LAST;
6695
6696   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6697     {
6698       if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6699         {
6700           if (reduc_fn != IFN_LAST
6701               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6702                                                   OPTIMIZE_FOR_SPEED))
6703             {
6704               if (dump_enabled_p ())
6705                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6706                                  "reduc op not supported by target.\n");
6707
6708               reduc_fn = IFN_LAST;
6709             }
6710         }
6711       else
6712         {
6713           if (!nested_cycle || double_reduc)
6714             {
6715               if (dump_enabled_p ())
6716                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6717                                  "no reduc code for scalar code.\n");
6718
6719               return false;
6720             }
6721         }
6722     }
6723   else
6724     {
6725       int scalar_precision
6726         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6727       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6728       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6729                                                 nunits_out);
6730
6731       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6732                                           OPTIMIZE_FOR_SPEED))
6733         reduc_fn = IFN_REDUC_MAX;
6734     }
6735
6736   if (reduc_fn == IFN_LAST && !nunits_out.is_constant ())
6737     {
6738       if (dump_enabled_p ())
6739         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6740                          "missing target support for reduction on"
6741                          " variable-length vectors.\n");
6742       return false;
6743     }
6744
6745   if ((double_reduc
6746        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6747       && ncopies > 1)
6748     {
6749       if (dump_enabled_p ())
6750         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6751                          "multiple types in double reduction or condition "
6752                          "reduction.\n");
6753       return false;
6754     }
6755
6756   /* For SLP reductions, see if there is a neutral value we can use.  */
6757   tree neutral_op = NULL_TREE;
6758   if (slp_node)
6759     neutral_op
6760       = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
6761                                       GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6762
6763   /* For double reductions, and for SLP reductions with a neutral value,
6764      we construct a variable-length initial vector by loading a vector
6765      full of the neutral value and then shift-and-inserting the start
6766      values into the low-numbered elements.  */
6767   if ((double_reduc || neutral_op)
6768       && !nunits_out.is_constant ()
6769       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6770                                           vectype_out, OPTIMIZE_FOR_SPEED))
6771     {
6772       if (dump_enabled_p ())
6773         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6774                          "reduction on variable-length vectors requires"
6775                          " target support for a vector-shift-and-insert"
6776                          " operation.\n");
6777       return false;
6778     }
6779
6780   /* Check extra constraints for variable-length unchained SLP reductions.  */
6781   if (STMT_SLP_TYPE (stmt_info)
6782       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6783       && !nunits_out.is_constant ())
6784     {
6785       /* We checked above that we could build the initial vector when
6786          there's a neutral element value.  Check here for the case in
6787          which each SLP statement has its own initial value and in which
6788          that value needs to be repeated for every instance of the
6789          statement within the initial vector.  */
6790       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6791       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6792       if (!neutral_op
6793           && !can_duplicate_and_interleave_p (group_size, elt_mode))
6794         {
6795           if (dump_enabled_p ())
6796             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6797                              "unsupported form of SLP reduction for"
6798                              " variable-length vectors: cannot build"
6799                              " initial vector.\n");
6800           return false;
6801         }
6802       /* The epilogue code relies on the number of elements being a multiple
6803          of the group size.  The duplicate-and-interleave approach to setting
6804          up the the initial vector does too.  */
6805       if (!multiple_p (nunits_out, group_size))
6806         {
6807           if (dump_enabled_p ())
6808             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6809                              "unsupported form of SLP reduction for"
6810                              " variable-length vectors: the vector size"
6811                              " is not a multiple of the number of results.\n");
6812           return false;
6813         }
6814     }
6815
6816   /* In case of widenning multiplication by a constant, we update the type
6817      of the constant to be the type of the other operand.  We check that the
6818      constant fits the type in the pattern recognition pass.  */
6819   if (code == DOT_PROD_EXPR
6820       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6821     {
6822       if (TREE_CODE (ops[0]) == INTEGER_CST)
6823         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6824       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6825         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6826       else
6827         {
6828           if (dump_enabled_p ())
6829             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6830                              "invalid types in dot-prod\n");
6831
6832           return false;
6833         }
6834     }
6835
6836   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6837     {
6838       widest_int ni;
6839
6840       if (! max_loop_iterations (loop, &ni))
6841         {
6842           if (dump_enabled_p ())
6843             dump_printf_loc (MSG_NOTE, vect_location,
6844                              "loop count not known, cannot create cond "
6845                              "reduction.\n");
6846           return false;
6847         }
6848       /* Convert backedges to iterations.  */
6849       ni += 1;
6850
6851       /* The additional index will be the same type as the condition.  Check
6852          that the loop can fit into this less one (because we'll use up the
6853          zero slot for when there are no matches).  */
6854       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6855       if (wi::geu_p (ni, wi::to_widest (max_index)))
6856         {
6857           if (dump_enabled_p ())
6858             dump_printf_loc (MSG_NOTE, vect_location,
6859                              "loop size is greater than data size.\n");
6860           return false;
6861         }
6862     }
6863
6864   /* In case the vectorization factor (VF) is bigger than the number
6865      of elements that we can fit in a vectype (nunits), we have to generate
6866      more than one vector stmt - i.e - we need to "unroll" the
6867      vector stmt by a factor VF/nunits.  For more details see documentation
6868      in vectorizable_operation.  */
6869
6870   /* If the reduction is used in an outer loop we need to generate
6871      VF intermediate results, like so (e.g. for ncopies=2):
6872         r0 = phi (init, r0)
6873         r1 = phi (init, r1)
6874         r0 = x0 + r0;
6875         r1 = x1 + r1;
6876     (i.e. we generate VF results in 2 registers).
6877     In this case we have a separate def-use cycle for each copy, and therefore
6878     for each copy we get the vector def for the reduction variable from the
6879     respective phi node created for this copy.
6880
6881     Otherwise (the reduction is unused in the loop nest), we can combine
6882     together intermediate results, like so (e.g. for ncopies=2):
6883         r = phi (init, r)
6884         r = x0 + r;
6885         r = x1 + r;
6886    (i.e. we generate VF/2 results in a single register).
6887    In this case for each copy we get the vector def for the reduction variable
6888    from the vectorized reduction operation generated in the previous iteration.
6889
6890    This only works when we see both the reduction PHI and its only consumer
6891    in vectorizable_reduction and there are no intermediate stmts
6892    participating.  */
6893   use_operand_p use_p;
6894   gimple *use_stmt;
6895   if (ncopies > 1
6896       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6897       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6898       && (use_stmt == stmt
6899           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6900     {
6901       single_defuse_cycle = true;
6902       epilog_copies = 1;
6903     }
6904   else
6905     epilog_copies = ncopies;
6906
6907   /* If the reduction stmt is one of the patterns that have lane
6908      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6909   if ((ncopies > 1
6910        && ! single_defuse_cycle)
6911       && (code == DOT_PROD_EXPR
6912           || code == WIDEN_SUM_EXPR
6913           || code == SAD_EXPR))
6914     {
6915       if (dump_enabled_p ())
6916         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6917                          "multi def-use cycle not possible for lane-reducing "
6918                          "reduction operation\n");
6919       return false;
6920     }
6921
6922   if (slp_node)
6923     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6924   else
6925     vec_num = 1;
6926
6927   internal_fn cond_fn = get_conditional_internal_fn (code);
6928   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6929
6930   if (!vec_stmt) /* transformation not required.  */
6931     {
6932       if (first_p)
6933         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
6934       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6935         {
6936           if (cond_fn == IFN_LAST
6937               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6938                                                   OPTIMIZE_FOR_SPEED))
6939             {
6940               if (dump_enabled_p ())
6941                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6942                                  "can't use a fully-masked loop because no"
6943                                  " conditional operation is available.\n");
6944               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6945             }
6946           else if (reduc_index == -1)
6947             {
6948               if (dump_enabled_p ())
6949                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6950                                  "can't use a fully-masked loop for chained"
6951                                  " reductions.\n");
6952               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6953             }
6954           else
6955             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6956                                    vectype_in);
6957         }
6958       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6959       return true;
6960     }
6961
6962   /* Transform.  */
6963
6964   if (dump_enabled_p ())
6965     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6966
6967   /* FORNOW: Multiple types are not supported for condition.  */
6968   if (code == COND_EXPR)
6969     gcc_assert (ncopies == 1);
6970
6971   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6972
6973   /* Create the destination vector  */
6974   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6975
6976   prev_stmt_info = NULL;
6977   prev_phi_info = NULL;
6978   if (!slp_node)
6979     {
6980       vec_oprnds0.create (1);
6981       vec_oprnds1.create (1);
6982       if (op_type == ternary_op)
6983         vec_oprnds2.create (1);
6984     }
6985
6986   phis.create (vec_num);
6987   vect_defs.create (vec_num);
6988   if (!slp_node)
6989     vect_defs.quick_push (NULL_TREE);
6990
6991   if (slp_node)
6992     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6993   else
6994     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6995
6996   for (j = 0; j < ncopies; j++)
6997     {
6998       if (code == COND_EXPR)
6999         {
7000           gcc_assert (!slp_node);
7001           vectorizable_condition (stmt, gsi, vec_stmt,
7002                                   PHI_RESULT (phis[0]),
7003                                   reduc_index, NULL);
7004           /* Multiple types are not supported for condition.  */
7005           break;
7006         }
7007
7008       /* Handle uses.  */
7009       if (j == 0)
7010         {
7011           if (slp_node)
7012             {
7013               /* Get vec defs for all the operands except the reduction index,
7014                  ensuring the ordering of the ops in the vector is kept.  */
7015               auto_vec<tree, 3> slp_ops;
7016               auto_vec<vec<tree>, 3> vec_defs;
7017
7018               slp_ops.quick_push (ops[0]);
7019               slp_ops.quick_push (ops[1]);
7020               if (op_type == ternary_op)
7021                 slp_ops.quick_push (ops[2]);
7022
7023               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7024
7025               vec_oprnds0.safe_splice (vec_defs[0]);
7026               vec_defs[0].release ();
7027               vec_oprnds1.safe_splice (vec_defs[1]);
7028               vec_defs[1].release ();
7029               if (op_type == ternary_op)
7030                 {
7031                   vec_oprnds2.safe_splice (vec_defs[2]);
7032                   vec_defs[2].release ();
7033                 }
7034             }
7035           else
7036             {
7037               vec_oprnds0.quick_push
7038                 (vect_get_vec_def_for_operand (ops[0], stmt));
7039               vec_oprnds1.quick_push
7040                 (vect_get_vec_def_for_operand (ops[1], stmt));
7041               if (op_type == ternary_op)
7042                 vec_oprnds2.quick_push
7043                   (vect_get_vec_def_for_operand (ops[2], stmt));
7044             }
7045         }
7046       else
7047         {
7048           if (!slp_node)
7049             {
7050               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7051
7052               if (single_defuse_cycle && reduc_index == 0)
7053                 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7054               else
7055                 vec_oprnds0[0]
7056                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7057               if (single_defuse_cycle && reduc_index == 1)
7058                 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7059               else
7060                 vec_oprnds1[0]
7061                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7062               if (op_type == ternary_op)
7063                 {
7064                   if (single_defuse_cycle && reduc_index == 2)
7065                     vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7066                   else
7067                     vec_oprnds2[0]
7068                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7069                 }
7070             }
7071         }
7072
7073       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7074         {
7075           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7076           if (masked_loop_p)
7077             {
7078               /* Make sure that the reduction accumulator is vop[0].  */
7079               if (reduc_index == 1)
7080                 {
7081                   gcc_assert (commutative_tree_code (code));
7082                   std::swap (vop[0], vop[1]);
7083                 }
7084               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7085                                               vectype_in, i * ncopies + j);
7086               gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7087                                                         vop[0], vop[1]);
7088               new_temp = make_ssa_name (vec_dest, call);
7089               gimple_call_set_lhs (call, new_temp);
7090               gimple_call_set_nothrow (call, true);
7091               new_stmt = call;
7092             }
7093           else
7094             {
7095               if (op_type == ternary_op)
7096                 vop[2] = vec_oprnds2[i];
7097
7098               new_temp = make_ssa_name (vec_dest, new_stmt);
7099               new_stmt = gimple_build_assign (new_temp, code,
7100                                               vop[0], vop[1], vop[2]);
7101             }
7102           vect_finish_stmt_generation (stmt, new_stmt, gsi);
7103
7104           if (slp_node)
7105             {
7106               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7107               vect_defs.quick_push (new_temp);
7108             }
7109           else
7110             vect_defs[0] = new_temp;
7111         }
7112
7113       if (slp_node)
7114         continue;
7115
7116       if (j == 0)
7117         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7118       else
7119         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7120
7121       prev_stmt_info = vinfo_for_stmt (new_stmt);
7122     }
7123
7124   /* Finalize the reduction-phi (set its arguments) and create the
7125      epilog reduction code.  */
7126   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7127     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7128
7129   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7130                                     epilog_copies, reduc_fn, phis,
7131                                     double_reduc, slp_node, slp_node_instance,
7132                                     cond_reduc_val, cond_reduc_op_code,
7133                                     neutral_op);
7134
7135   return true;
7136 }
7137
7138 /* Function vect_min_worthwhile_factor.
7139
7140    For a loop where we could vectorize the operation indicated by CODE,
7141    return the minimum vectorization factor that makes it worthwhile
7142    to use generic vectors.  */
7143 static unsigned int
7144 vect_min_worthwhile_factor (enum tree_code code)
7145 {
7146   switch (code)
7147     {
7148     case PLUS_EXPR:
7149     case MINUS_EXPR:
7150     case NEGATE_EXPR:
7151       return 4;
7152
7153     case BIT_AND_EXPR:
7154     case BIT_IOR_EXPR:
7155     case BIT_XOR_EXPR:
7156     case BIT_NOT_EXPR:
7157       return 2;
7158
7159     default:
7160       return INT_MAX;
7161     }
7162 }
7163
7164 /* Return true if VINFO indicates we are doing loop vectorization and if
7165    it is worth decomposing CODE operations into scalar operations for
7166    that loop's vectorization factor.  */
7167
7168 bool
7169 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7170 {
7171   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7172   unsigned HOST_WIDE_INT value;
7173   return (loop_vinfo
7174           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7175           && value >= vect_min_worthwhile_factor (code));
7176 }
7177
7178 /* Function vectorizable_induction
7179
7180    Check if PHI performs an induction computation that can be vectorized.
7181    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7182    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7183    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7184
7185 bool
7186 vectorizable_induction (gimple *phi,
7187                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7188                         gimple **vec_stmt, slp_tree slp_node)
7189 {
7190   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7191   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7192   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7193   unsigned ncopies;
7194   bool nested_in_vect_loop = false;
7195   struct loop *iv_loop;
7196   tree vec_def;
7197   edge pe = loop_preheader_edge (loop);
7198   basic_block new_bb;
7199   tree new_vec, vec_init, vec_step, t;
7200   tree new_name;
7201   gimple *new_stmt;
7202   gphi *induction_phi;
7203   tree induc_def, vec_dest;
7204   tree init_expr, step_expr;
7205   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7206   unsigned i;
7207   tree expr;
7208   gimple_seq stmts;
7209   imm_use_iterator imm_iter;
7210   use_operand_p use_p;
7211   gimple *exit_phi;
7212   edge latch_e;
7213   tree loop_arg;
7214   gimple_stmt_iterator si;
7215   basic_block bb = gimple_bb (phi);
7216
7217   if (gimple_code (phi) != GIMPLE_PHI)
7218     return false;
7219
7220   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7221     return false;
7222
7223   /* Make sure it was recognized as induction computation.  */
7224   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7225     return false;
7226
7227   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7228   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7229
7230   if (slp_node)
7231     ncopies = 1;
7232   else
7233     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7234   gcc_assert (ncopies >= 1);
7235
7236   /* FORNOW. These restrictions should be relaxed.  */
7237   if (nested_in_vect_loop_p (loop, phi))
7238     {
7239       imm_use_iterator imm_iter;
7240       use_operand_p use_p;
7241       gimple *exit_phi;
7242       edge latch_e;
7243       tree loop_arg;
7244
7245       if (ncopies > 1)
7246         {
7247           if (dump_enabled_p ())
7248             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7249                              "multiple types in nested loop.\n");
7250           return false;
7251         }
7252
7253       /* FORNOW: outer loop induction with SLP not supported.  */
7254       if (STMT_SLP_TYPE (stmt_info))
7255         return false;
7256
7257       exit_phi = NULL;
7258       latch_e = loop_latch_edge (loop->inner);
7259       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7260       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7261         {
7262           gimple *use_stmt = USE_STMT (use_p);
7263           if (is_gimple_debug (use_stmt))
7264             continue;
7265
7266           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7267             {
7268               exit_phi = use_stmt;
7269               break;
7270             }
7271         }
7272       if (exit_phi)
7273         {
7274           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7275           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7276                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7277             {
7278               if (dump_enabled_p ())
7279                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7280                                  "inner-loop induction only used outside "
7281                                  "of the outer vectorized loop.\n");
7282               return false;
7283             }
7284         }
7285
7286       nested_in_vect_loop = true;
7287       iv_loop = loop->inner;
7288     }
7289   else
7290     iv_loop = loop;
7291   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7292
7293   if (slp_node && !nunits.is_constant ())
7294     {
7295       /* The current SLP code creates the initial value element-by-element.  */
7296       if (dump_enabled_p ())
7297         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7298                          "SLP induction not supported for variable-length"
7299                          " vectors.\n");
7300       return false;
7301     }
7302
7303   if (!vec_stmt) /* transformation not required.  */
7304     {
7305       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7306       if (dump_enabled_p ())
7307         dump_printf_loc (MSG_NOTE, vect_location,
7308                          "=== vectorizable_induction ===\n");
7309       vect_model_induction_cost (stmt_info, ncopies);
7310       return true;
7311     }
7312
7313   /* Transform.  */
7314
7315   /* Compute a vector variable, initialized with the first VF values of
7316      the induction variable.  E.g., for an iv with IV_PHI='X' and
7317      evolution S, for a vector of 4 units, we want to compute:
7318      [X, X + S, X + 2*S, X + 3*S].  */
7319
7320   if (dump_enabled_p ())
7321     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7322
7323   latch_e = loop_latch_edge (iv_loop);
7324   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7325
7326   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7327   gcc_assert (step_expr != NULL_TREE);
7328
7329   pe = loop_preheader_edge (iv_loop);
7330   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7331                                      loop_preheader_edge (iv_loop));
7332
7333   /* Convert the initial value and step to the desired type.  */
7334   stmts = NULL;
7335   init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7336   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7337
7338   /* If we are using the loop mask to "peel" for alignment then we need
7339      to adjust the start value here.  */
7340   tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7341   if (skip_niters != NULL_TREE)
7342     {
7343       if (FLOAT_TYPE_P (vectype))
7344         skip_niters = gimple_build (&stmts, FLOAT_EXPR, TREE_TYPE (vectype),
7345                                     skip_niters);
7346       else
7347         skip_niters = gimple_convert (&stmts, TREE_TYPE (vectype),
7348                                       skip_niters);
7349       tree skip_step = gimple_build (&stmts, MULT_EXPR, TREE_TYPE (vectype),
7350                                      skip_niters, step_expr);
7351       init_expr = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (vectype),
7352                                 init_expr, skip_step);
7353     }
7354
7355   if (stmts)
7356     {
7357       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7358       gcc_assert (!new_bb);
7359     }
7360
7361   /* Find the first insertion point in the BB.  */
7362   si = gsi_after_labels (bb);
7363
7364   /* For SLP induction we have to generate several IVs as for example
7365      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7366      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7367      [VF*S, VF*S, VF*S, VF*S] for all.  */
7368   if (slp_node)
7369     {
7370       /* Enforced above.  */
7371       unsigned int const_nunits = nunits.to_constant ();
7372
7373       /* Convert the init to the desired type.  */
7374       stmts = NULL;
7375       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7376       if (stmts)
7377         {
7378           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7379           gcc_assert (!new_bb);
7380         }
7381
7382       /* Generate [VF*S, VF*S, ... ].  */
7383       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7384         {
7385           expr = build_int_cst (integer_type_node, vf);
7386           expr = fold_convert (TREE_TYPE (step_expr), expr);
7387         }
7388       else
7389         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7390       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7391                               expr, step_expr);
7392       if (! CONSTANT_CLASS_P (new_name))
7393         new_name = vect_init_vector (phi, new_name,
7394                                      TREE_TYPE (step_expr), NULL);
7395       new_vec = build_vector_from_val (vectype, new_name);
7396       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7397
7398       /* Now generate the IVs.  */
7399       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7400       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7401       unsigned elts = const_nunits * nvects;
7402       unsigned nivs = least_common_multiple (group_size,
7403                                              const_nunits) / const_nunits;
7404       gcc_assert (elts % group_size == 0);
7405       tree elt = init_expr;
7406       unsigned ivn;
7407       for (ivn = 0; ivn < nivs; ++ivn)
7408         {
7409           tree_vector_builder elts (vectype, const_nunits, 1);
7410           stmts = NULL;
7411           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7412             {
7413               if (ivn*const_nunits + eltn >= group_size
7414                   && (ivn * const_nunits + eltn) % group_size == 0)
7415                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7416                                     elt, step_expr);
7417               elts.quick_push (elt);
7418             }
7419           vec_init = gimple_build_vector (&stmts, &elts);
7420           if (stmts)
7421             {
7422               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7423               gcc_assert (!new_bb);
7424             }
7425
7426           /* Create the induction-phi that defines the induction-operand.  */
7427           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7428           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7429           set_vinfo_for_stmt (induction_phi,
7430                               new_stmt_vec_info (induction_phi, loop_vinfo));
7431           induc_def = PHI_RESULT (induction_phi);
7432
7433           /* Create the iv update inside the loop  */
7434           vec_def = make_ssa_name (vec_dest);
7435           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7436           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7437           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7438
7439           /* Set the arguments of the phi node:  */
7440           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7441           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7442                        UNKNOWN_LOCATION);
7443
7444           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7445         }
7446
7447       /* Re-use IVs when we can.  */
7448       if (ivn < nvects)
7449         {
7450           unsigned vfp
7451             = least_common_multiple (group_size, const_nunits) / group_size;
7452           /* Generate [VF'*S, VF'*S, ... ].  */
7453           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7454             {
7455               expr = build_int_cst (integer_type_node, vfp);
7456               expr = fold_convert (TREE_TYPE (step_expr), expr);
7457             }
7458           else
7459             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7460           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7461                                   expr, step_expr);
7462           if (! CONSTANT_CLASS_P (new_name))
7463             new_name = vect_init_vector (phi, new_name,
7464                                          TREE_TYPE (step_expr), NULL);
7465           new_vec = build_vector_from_val (vectype, new_name);
7466           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7467           for (; ivn < nvects; ++ivn)
7468             {
7469               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7470               tree def;
7471               if (gimple_code (iv) == GIMPLE_PHI)
7472                 def = gimple_phi_result (iv);
7473               else
7474                 def = gimple_assign_lhs (iv);
7475               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7476                                               PLUS_EXPR,
7477                                               def, vec_step);
7478               if (gimple_code (iv) == GIMPLE_PHI)
7479                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7480               else
7481                 {
7482                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7483                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7484                 }
7485               set_vinfo_for_stmt (new_stmt,
7486                                   new_stmt_vec_info (new_stmt, loop_vinfo));
7487               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7488             }
7489         }
7490
7491       return true;
7492     }
7493
7494   /* Create the vector that holds the initial_value of the induction.  */
7495   if (nested_in_vect_loop)
7496     {
7497       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7498          been created during vectorization of previous stmts.  We obtain it
7499          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7500       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7501       /* If the initial value is not of proper type, convert it.  */
7502       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7503         {
7504           new_stmt
7505             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7506                                                           vect_simple_var,
7507                                                           "vec_iv_"),
7508                                    VIEW_CONVERT_EXPR,
7509                                    build1 (VIEW_CONVERT_EXPR, vectype,
7510                                            vec_init));
7511           vec_init = gimple_assign_lhs (new_stmt);
7512           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7513                                                  new_stmt);
7514           gcc_assert (!new_bb);
7515           set_vinfo_for_stmt (new_stmt,
7516                               new_stmt_vec_info (new_stmt, loop_vinfo));
7517         }
7518     }
7519   else
7520     {
7521       /* iv_loop is the loop to be vectorized. Create:
7522          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7523       stmts = NULL;
7524       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7525
7526       unsigned HOST_WIDE_INT const_nunits;
7527       if (nunits.is_constant (&const_nunits))
7528         {
7529           tree_vector_builder elts (vectype, const_nunits, 1);
7530           elts.quick_push (new_name);
7531           for (i = 1; i < const_nunits; i++)
7532             {
7533               /* Create: new_name_i = new_name + step_expr  */
7534               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7535                                        new_name, step_expr);
7536               elts.quick_push (new_name);
7537             }
7538           /* Create a vector from [new_name_0, new_name_1, ...,
7539              new_name_nunits-1]  */
7540           vec_init = gimple_build_vector (&stmts, &elts);
7541         }
7542       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7543         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7544         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7545                                  new_name, step_expr);
7546       else
7547         {
7548           /* Build:
7549                 [base, base, base, ...]
7550                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7551           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7552           gcc_assert (flag_associative_math);
7553           tree index = build_index_vector (vectype, 0, 1);
7554           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7555                                                         new_name);
7556           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7557                                                         step_expr);
7558           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7559           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7560                                    vec_init, step_vec);
7561           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7562                                    vec_init, base_vec);
7563         }
7564
7565       if (stmts)
7566         {
7567           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7568           gcc_assert (!new_bb);
7569         }
7570     }
7571
7572
7573   /* Create the vector that holds the step of the induction.  */
7574   if (nested_in_vect_loop)
7575     /* iv_loop is nested in the loop to be vectorized. Generate:
7576        vec_step = [S, S, S, S]  */
7577     new_name = step_expr;
7578   else
7579     {
7580       /* iv_loop is the loop to be vectorized. Generate:
7581           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7582       gimple_seq seq = NULL;
7583       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7584         {
7585           expr = build_int_cst (integer_type_node, vf);
7586           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7587         }
7588       else
7589         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7590       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7591                                expr, step_expr);
7592       if (seq)
7593         {
7594           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7595           gcc_assert (!new_bb);
7596         }
7597     }
7598
7599   t = unshare_expr (new_name);
7600   gcc_assert (CONSTANT_CLASS_P (new_name)
7601               || TREE_CODE (new_name) == SSA_NAME);
7602   new_vec = build_vector_from_val (vectype, t);
7603   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7604
7605
7606   /* Create the following def-use cycle:
7607      loop prolog:
7608          vec_init = ...
7609          vec_step = ...
7610      loop:
7611          vec_iv = PHI <vec_init, vec_loop>
7612          ...
7613          STMT
7614          ...
7615          vec_loop = vec_iv + vec_step;  */
7616
7617   /* Create the induction-phi that defines the induction-operand.  */
7618   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7619   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7620   set_vinfo_for_stmt (induction_phi,
7621                       new_stmt_vec_info (induction_phi, loop_vinfo));
7622   induc_def = PHI_RESULT (induction_phi);
7623
7624   /* Create the iv update inside the loop  */
7625   vec_def = make_ssa_name (vec_dest);
7626   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7627   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7628   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7629
7630   /* Set the arguments of the phi node:  */
7631   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7632   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7633                UNKNOWN_LOCATION);
7634
7635   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7636
7637   /* In case that vectorization factor (VF) is bigger than the number
7638      of elements that we can fit in a vectype (nunits), we have to generate
7639      more than one vector stmt - i.e - we need to "unroll" the
7640      vector stmt by a factor VF/nunits.  For more details see documentation
7641      in vectorizable_operation.  */
7642
7643   if (ncopies > 1)
7644     {
7645       gimple_seq seq = NULL;
7646       stmt_vec_info prev_stmt_vinfo;
7647       /* FORNOW. This restriction should be relaxed.  */
7648       gcc_assert (!nested_in_vect_loop);
7649
7650       /* Create the vector that holds the step of the induction.  */
7651       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7652         {
7653           expr = build_int_cst (integer_type_node, nunits);
7654           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7655         }
7656       else
7657         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7658       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7659                                expr, step_expr);
7660       if (seq)
7661         {
7662           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7663           gcc_assert (!new_bb);
7664         }
7665
7666       t = unshare_expr (new_name);
7667       gcc_assert (CONSTANT_CLASS_P (new_name)
7668                   || TREE_CODE (new_name) == SSA_NAME);
7669       new_vec = build_vector_from_val (vectype, t);
7670       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7671
7672       vec_def = induc_def;
7673       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7674       for (i = 1; i < ncopies; i++)
7675         {
7676           /* vec_i = vec_prev + vec_step  */
7677           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7678                                           vec_def, vec_step);
7679           vec_def = make_ssa_name (vec_dest, new_stmt);
7680           gimple_assign_set_lhs (new_stmt, vec_def);
7681
7682           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7683           set_vinfo_for_stmt (new_stmt,
7684                               new_stmt_vec_info (new_stmt, loop_vinfo));
7685           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7686           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7687         }
7688     }
7689
7690   if (nested_in_vect_loop)
7691     {
7692       /* Find the loop-closed exit-phi of the induction, and record
7693          the final vector of induction results:  */
7694       exit_phi = NULL;
7695       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7696         {
7697           gimple *use_stmt = USE_STMT (use_p);
7698           if (is_gimple_debug (use_stmt))
7699             continue;
7700
7701           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7702             {
7703               exit_phi = use_stmt;
7704               break;
7705             }
7706         }
7707       if (exit_phi)
7708         {
7709           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7710           /* FORNOW. Currently not supporting the case that an inner-loop induction
7711              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7712           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7713                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7714
7715           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7716           if (dump_enabled_p ())
7717             {
7718               dump_printf_loc (MSG_NOTE, vect_location,
7719                                "vector of inductions after inner-loop:");
7720               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7721             }
7722         }
7723     }
7724
7725
7726   if (dump_enabled_p ())
7727     {
7728       dump_printf_loc (MSG_NOTE, vect_location,
7729                        "transform induction: created def-use cycle: ");
7730       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7731       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7732                         SSA_NAME_DEF_STMT (vec_def), 0);
7733     }
7734
7735   return true;
7736 }
7737
7738 /* Function vectorizable_live_operation.
7739
7740    STMT computes a value that is used outside the loop.  Check if
7741    it can be supported.  */
7742
7743 bool
7744 vectorizable_live_operation (gimple *stmt,
7745                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7746                              slp_tree slp_node, int slp_index,
7747                              gimple **vec_stmt)
7748 {
7749   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7750   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7751   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7752   imm_use_iterator imm_iter;
7753   tree lhs, lhs_type, bitsize, vec_bitsize;
7754   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7755   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7756   int ncopies;
7757   gimple *use_stmt;
7758   auto_vec<tree> vec_oprnds;
7759   int vec_entry = 0;
7760   poly_uint64 vec_index = 0;
7761
7762   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7763
7764   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7765     return false;
7766
7767   /* FORNOW.  CHECKME.  */
7768   if (nested_in_vect_loop_p (loop, stmt))
7769     return false;
7770
7771   /* If STMT is not relevant and it is a simple assignment and its inputs are
7772      invariant then it can remain in place, unvectorized.  The original last
7773      scalar value that it computes will be used.  */
7774   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7775     {
7776       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7777       if (dump_enabled_p ())
7778         dump_printf_loc (MSG_NOTE, vect_location,
7779                          "statement is simple and uses invariant.  Leaving in "
7780                          "place.\n");
7781       return true;
7782     }
7783
7784   if (slp_node)
7785     ncopies = 1;
7786   else
7787     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7788
7789   if (slp_node)
7790     {
7791       gcc_assert (slp_index >= 0);
7792
7793       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7794       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7795
7796       /* Get the last occurrence of the scalar index from the concatenation of
7797          all the slp vectors. Calculate which slp vector it is and the index
7798          within.  */
7799       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7800
7801       /* Calculate which vector contains the result, and which lane of
7802          that vector we need.  */
7803       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7804         {
7805           if (dump_enabled_p ())
7806             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7807                              "Cannot determine which vector holds the"
7808                              " final result.\n");
7809           return false;
7810         }
7811     }
7812
7813   if (!vec_stmt)
7814     {
7815       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7816         {
7817           if (dump_enabled_p ())
7818             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7819                              "can't use a fully-masked loop because "
7820                              "a value is live outside the loop.\n");
7821           LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7822         }
7823
7824       /* No transformation required.  */
7825       return true;
7826     }
7827
7828   /* If stmt has a related stmt, then use that for getting the lhs.  */
7829   if (is_pattern_stmt_p (stmt_info))
7830     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7831
7832   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7833         : gimple_get_lhs (stmt);
7834   lhs_type = TREE_TYPE (lhs);
7835
7836   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7837              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7838              : TYPE_SIZE (TREE_TYPE (vectype)));
7839   vec_bitsize = TYPE_SIZE (vectype);
7840
7841   gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7842
7843   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7844   tree vec_lhs, bitstart;
7845   if (slp_node)
7846     {
7847       /* Get the correct slp vectorized stmt.  */
7848       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7849
7850       /* Get entry to use.  */
7851       bitstart = bitsize_int (vec_index);
7852       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7853     }
7854   else
7855     {
7856       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7857       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7858
7859       /* For multiple copies, get the last copy.  */
7860       for (int i = 1; i < ncopies; ++i)
7861         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7862                                                   vec_lhs);
7863
7864       /* Get the last lane in the vector.  */
7865       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7866     }
7867
7868   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7869      loop.  */
7870   gimple_seq stmts = NULL;
7871   tree bftype = TREE_TYPE (vectype);
7872   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7873     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7874   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7875   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7876                                    true, NULL_TREE);
7877   if (stmts)
7878     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7879
7880   /* Replace use of lhs with newly computed result.  If the use stmt is a
7881      single arg PHI, just replace all uses of PHI result.  It's necessary
7882      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7883   use_operand_p use_p;
7884   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7885     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7886         && !is_gimple_debug (use_stmt))
7887     {
7888       if (gimple_code (use_stmt) == GIMPLE_PHI
7889           && gimple_phi_num_args (use_stmt) == 1)
7890         {
7891           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7892         }
7893       else
7894         {
7895           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7896             SET_USE (use_p, new_tree);
7897         }
7898       update_stmt (use_stmt);
7899     }
7900
7901   return true;
7902 }
7903
7904 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
7905
7906 static void
7907 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7908 {
7909   ssa_op_iter op_iter;
7910   imm_use_iterator imm_iter;
7911   def_operand_p def_p;
7912   gimple *ustmt;
7913
7914   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7915     {
7916       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7917         {
7918           basic_block bb;
7919
7920           if (!is_gimple_debug (ustmt))
7921             continue;
7922
7923           bb = gimple_bb (ustmt);
7924
7925           if (!flow_bb_inside_loop_p (loop, bb))
7926             {
7927               if (gimple_debug_bind_p (ustmt))
7928                 {
7929                   if (dump_enabled_p ())
7930                     dump_printf_loc (MSG_NOTE, vect_location,
7931                                      "killing debug use\n");
7932
7933                   gimple_debug_bind_reset_value (ustmt);
7934                   update_stmt (ustmt);
7935                 }
7936               else
7937                 gcc_unreachable ();
7938             }
7939         }
7940     }
7941 }
7942
7943 /* Given loop represented by LOOP_VINFO, return true if computation of
7944    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7945    otherwise.  */
7946
7947 static bool
7948 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7949 {
7950   /* Constant case.  */
7951   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7952     {
7953       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7954       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7955
7956       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7957       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7958       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7959         return true;
7960     }
7961
7962   widest_int max;
7963   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7964   /* Check the upper bound of loop niters.  */
7965   if (get_max_loop_iterations (loop, &max))
7966     {
7967       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7968       signop sgn = TYPE_SIGN (type);
7969       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7970       if (max < type_max)
7971         return true;
7972     }
7973   return false;
7974 }
7975
7976 /* Return a mask type with half the number of elements as TYPE.  */
7977
7978 tree
7979 vect_halve_mask_nunits (tree type)
7980 {
7981   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
7982   return build_truth_vector_type (nunits, current_vector_size);
7983 }
7984
7985 /* Return a mask type with twice as many elements as TYPE.  */
7986
7987 tree
7988 vect_double_mask_nunits (tree type)
7989 {
7990   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
7991   return build_truth_vector_type (nunits, current_vector_size);
7992 }
7993
7994 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
7995    contain a sequence of NVECTORS masks that each control a vector of type
7996    VECTYPE.  */
7997
7998 void
7999 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8000                        unsigned int nvectors, tree vectype)
8001 {
8002   gcc_assert (nvectors != 0);
8003   if (masks->length () < nvectors)
8004     masks->safe_grow_cleared (nvectors);
8005   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8006   /* The number of scalars per iteration and the number of vectors are
8007      both compile-time constants.  */
8008   unsigned int nscalars_per_iter
8009     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8010                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8011   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8012     {
8013       rgm->max_nscalars_per_iter = nscalars_per_iter;
8014       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8015     }
8016 }
8017
8018 /* Given a complete set of masks MASKS, extract mask number INDEX
8019    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8020    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8021
8022    See the comment above vec_loop_masks for more details about the mask
8023    arrangement.  */
8024
8025 tree
8026 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8027                     unsigned int nvectors, tree vectype, unsigned int index)
8028 {
8029   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8030   tree mask_type = rgm->mask_type;
8031
8032   /* Populate the rgroup's mask array, if this is the first time we've
8033      used it.  */
8034   if (rgm->masks.is_empty ())
8035     {
8036       rgm->masks.safe_grow_cleared (nvectors);
8037       for (unsigned int i = 0; i < nvectors; ++i)
8038         {
8039           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8040           /* Provide a dummy definition until the real one is available.  */
8041           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8042           rgm->masks[i] = mask;
8043         }
8044     }
8045
8046   tree mask = rgm->masks[index];
8047   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8048                 TYPE_VECTOR_SUBPARTS (vectype)))
8049     {
8050       /* A loop mask for data type X can be reused for data type Y
8051          if X has N times more elements than Y and if Y's elements
8052          are N times bigger than X's.  In this case each sequence
8053          of N elements in the loop mask will be all-zero or all-one.
8054          We can then view-convert the mask so that each sequence of
8055          N elements is replaced by a single element.  */
8056       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8057                               TYPE_VECTOR_SUBPARTS (vectype)));
8058       gimple_seq seq = NULL;
8059       mask_type = build_same_sized_truth_vector_type (vectype);
8060       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8061       if (seq)
8062         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8063     }
8064   return mask;
8065 }
8066
8067 /* Scale profiling counters by estimation for LOOP which is vectorized
8068    by factor VF.  */
8069
8070 static void
8071 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8072 {
8073   edge preheader = loop_preheader_edge (loop);
8074   /* Reduce loop iterations by the vectorization factor.  */
8075   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8076   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8077
8078   if (freq_h.nonzero_p ())
8079     {
8080       profile_probability p;
8081
8082       /* Avoid dropping loop body profile counter to 0 because of zero count
8083          in loop's preheader.  */
8084       if (!(freq_e == profile_count::zero ()))
8085         freq_e = freq_e.force_nonzero ();
8086       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8087       scale_loop_frequencies (loop, p);
8088     }
8089
8090   edge exit_e = single_exit (loop);
8091   exit_e->probability = profile_probability::always ()
8092                                  .apply_scale (1, new_est_niter + 1);
8093
8094   edge exit_l = single_pred_edge (loop->latch);
8095   profile_probability prob = exit_l->probability;
8096   exit_l->probability = exit_e->probability.invert ();
8097   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8098     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8099 }
8100
8101 /* Function vect_transform_loop.
8102
8103    The analysis phase has determined that the loop is vectorizable.
8104    Vectorize the loop - created vectorized stmts to replace the scalar
8105    stmts in the loop, and update the loop exit condition.
8106    Returns scalar epilogue loop if any.  */
8107
8108 struct loop *
8109 vect_transform_loop (loop_vec_info loop_vinfo)
8110 {
8111   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8112   struct loop *epilogue = NULL;
8113   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8114   int nbbs = loop->num_nodes;
8115   int i;
8116   tree niters_vector = NULL_TREE;
8117   tree step_vector = NULL_TREE;
8118   tree niters_vector_mult_vf = NULL_TREE;
8119   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8120   unsigned int lowest_vf = constant_lower_bound (vf);
8121   bool grouped_store;
8122   bool slp_scheduled = false;
8123   gimple *stmt, *pattern_stmt;
8124   gimple_seq pattern_def_seq = NULL;
8125   gimple_stmt_iterator pattern_def_si = gsi_none ();
8126   bool transform_pattern_stmt = false;
8127   bool check_profitability = false;
8128   unsigned int th;
8129
8130   if (dump_enabled_p ())
8131     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8132
8133   /* Use the more conservative vectorization threshold.  If the number
8134      of iterations is constant assume the cost check has been performed
8135      by our caller.  If the threshold makes all loops profitable that
8136      run at least the (estimated) vectorization factor number of times
8137      checking is pointless, too.  */
8138   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8139   if (th >= vect_vf_for_cost (loop_vinfo)
8140       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8141     {
8142       if (dump_enabled_p ())
8143         dump_printf_loc (MSG_NOTE, vect_location,
8144                          "Profitability threshold is %d loop iterations.\n",
8145                          th);
8146       check_profitability = true;
8147     }
8148
8149   /* Make sure there exists a single-predecessor exit bb.  Do this before
8150      versioning.   */
8151   edge e = single_exit (loop);
8152   if (! single_pred_p (e->dest))
8153     {
8154       split_loop_exit_edge (e);
8155       if (dump_enabled_p ())
8156         dump_printf (MSG_NOTE, "split exit edge\n");
8157     }
8158
8159   /* Version the loop first, if required, so the profitability check
8160      comes first.  */
8161
8162   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8163     {
8164       poly_uint64 versioning_threshold
8165         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8166       if (check_profitability
8167           && ordered_p (poly_uint64 (th), versioning_threshold))
8168         {
8169           versioning_threshold = ordered_max (poly_uint64 (th),
8170                                               versioning_threshold);
8171           check_profitability = false;
8172         }
8173       vect_loop_versioning (loop_vinfo, th, check_profitability,
8174                             versioning_threshold);
8175       check_profitability = false;
8176     }
8177
8178   /* Make sure there exists a single-predecessor exit bb also on the
8179      scalar loop copy.  Do this after versioning but before peeling
8180      so CFG structure is fine for both scalar and if-converted loop
8181      to make slpeel_duplicate_current_defs_from_edges face matched
8182      loop closed PHI nodes on the exit.  */
8183   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8184     {
8185       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8186       if (! single_pred_p (e->dest))
8187         {
8188           split_loop_exit_edge (e);
8189           if (dump_enabled_p ())
8190             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8191         }
8192     }
8193
8194   tree niters = vect_build_loop_niters (loop_vinfo);
8195   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8196   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8197   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8198   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8199                               &step_vector, &niters_vector_mult_vf, th,
8200                               check_profitability, niters_no_overflow);
8201
8202   if (niters_vector == NULL_TREE)
8203     {
8204       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8205           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8206           && known_eq (lowest_vf, vf))
8207         {
8208           niters_vector
8209             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8210                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8211           step_vector = build_one_cst (TREE_TYPE (niters));
8212         }
8213       else
8214         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8215                                      &step_vector, niters_no_overflow);
8216     }
8217
8218   /* 1) Make sure the loop header has exactly two entries
8219      2) Make sure we have a preheader basic block.  */
8220
8221   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8222
8223   split_edge (loop_preheader_edge (loop));
8224
8225   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8226       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8227     /* This will deal with any possible peeling.  */
8228     vect_prepare_for_masked_peels (loop_vinfo);
8229
8230   /* FORNOW: the vectorizer supports only loops which body consist
8231      of one basic block (header + empty latch). When the vectorizer will
8232      support more involved loop forms, the order by which the BBs are
8233      traversed need to be reconsidered.  */
8234
8235   for (i = 0; i < nbbs; i++)
8236     {
8237       basic_block bb = bbs[i];
8238       stmt_vec_info stmt_info;
8239
8240       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8241            gsi_next (&si))
8242         {
8243           gphi *phi = si.phi ();
8244           if (dump_enabled_p ())
8245             {
8246               dump_printf_loc (MSG_NOTE, vect_location,
8247                                "------>vectorizing phi: ");
8248               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8249             }
8250           stmt_info = vinfo_for_stmt (phi);
8251           if (!stmt_info)
8252             continue;
8253
8254           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8255             vect_loop_kill_debug_uses (loop, phi);
8256
8257           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8258               && !STMT_VINFO_LIVE_P (stmt_info))
8259             continue;
8260
8261           if (STMT_VINFO_VECTYPE (stmt_info)
8262               && (maybe_ne
8263                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8264               && dump_enabled_p ())
8265             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8266
8267           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8268                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8269                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8270               && ! PURE_SLP_STMT (stmt_info))
8271             {
8272               if (dump_enabled_p ())
8273                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8274               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8275             }
8276         }
8277
8278       pattern_stmt = NULL;
8279       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8280            !gsi_end_p (si) || transform_pattern_stmt;)
8281         {
8282           bool is_store;
8283
8284           if (transform_pattern_stmt)
8285             stmt = pattern_stmt;
8286           else
8287             {
8288               stmt = gsi_stmt (si);
8289               /* During vectorization remove existing clobber stmts.  */
8290               if (gimple_clobber_p (stmt))
8291                 {
8292                   unlink_stmt_vdef (stmt);
8293                   gsi_remove (&si, true);
8294                   release_defs (stmt);
8295                   continue;
8296                 }
8297             }
8298
8299           if (dump_enabled_p ())
8300             {
8301               dump_printf_loc (MSG_NOTE, vect_location,
8302                                "------>vectorizing statement: ");
8303               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8304             }
8305
8306           stmt_info = vinfo_for_stmt (stmt);
8307
8308           /* vector stmts created in the outer-loop during vectorization of
8309              stmts in an inner-loop may not have a stmt_info, and do not
8310              need to be vectorized.  */
8311           if (!stmt_info)
8312             {
8313               gsi_next (&si);
8314               continue;
8315             }
8316
8317           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8318             vect_loop_kill_debug_uses (loop, stmt);
8319
8320           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8321               && !STMT_VINFO_LIVE_P (stmt_info))
8322             {
8323               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8324                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8325                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8326                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8327                 {
8328                   stmt = pattern_stmt;
8329                   stmt_info = vinfo_for_stmt (stmt);
8330                 }
8331               else
8332                 {
8333                   gsi_next (&si);
8334                   continue;
8335                 }
8336             }
8337           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8338                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8339                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8340                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8341             transform_pattern_stmt = true;
8342
8343           /* If pattern statement has def stmts, vectorize them too.  */
8344           if (is_pattern_stmt_p (stmt_info))
8345             {
8346               if (pattern_def_seq == NULL)
8347                 {
8348                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8349                   pattern_def_si = gsi_start (pattern_def_seq);
8350                 }
8351               else if (!gsi_end_p (pattern_def_si))
8352                 gsi_next (&pattern_def_si);
8353               if (pattern_def_seq != NULL)
8354                 {
8355                   gimple *pattern_def_stmt = NULL;
8356                   stmt_vec_info pattern_def_stmt_info = NULL;
8357
8358                   while (!gsi_end_p (pattern_def_si))
8359                     {
8360                       pattern_def_stmt = gsi_stmt (pattern_def_si);
8361                       pattern_def_stmt_info
8362                         = vinfo_for_stmt (pattern_def_stmt);
8363                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8364                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8365                         break;
8366                       gsi_next (&pattern_def_si);
8367                     }
8368
8369                   if (!gsi_end_p (pattern_def_si))
8370                     {
8371                       if (dump_enabled_p ())
8372                         {
8373                           dump_printf_loc (MSG_NOTE, vect_location,
8374                                            "==> vectorizing pattern def "
8375                                            "stmt: ");
8376                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8377                                             pattern_def_stmt, 0);
8378                         }
8379
8380                       stmt = pattern_def_stmt;
8381                       stmt_info = pattern_def_stmt_info;
8382                     }
8383                   else
8384                     {
8385                       pattern_def_si = gsi_none ();
8386                       transform_pattern_stmt = false;
8387                     }
8388                 }
8389               else
8390                 transform_pattern_stmt = false;
8391             }
8392
8393           if (STMT_VINFO_VECTYPE (stmt_info))
8394             {
8395               poly_uint64 nunits
8396                 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8397               if (!STMT_SLP_TYPE (stmt_info)
8398                   && maybe_ne (nunits, vf)
8399                   && dump_enabled_p ())
8400                   /* For SLP VF is set according to unrolling factor, and not
8401                      to vector size, hence for SLP this print is not valid.  */
8402                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8403             }
8404
8405           /* SLP. Schedule all the SLP instances when the first SLP stmt is
8406              reached.  */
8407           if (STMT_SLP_TYPE (stmt_info))
8408             {
8409               if (!slp_scheduled)
8410                 {
8411                   slp_scheduled = true;
8412
8413                   if (dump_enabled_p ())
8414                     dump_printf_loc (MSG_NOTE, vect_location,
8415                                      "=== scheduling SLP instances ===\n");
8416
8417                   vect_schedule_slp (loop_vinfo);
8418                 }
8419
8420               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8421               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8422                 {
8423                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8424                     {
8425                       pattern_def_seq = NULL;
8426                       gsi_next (&si);
8427                     }
8428                   continue;
8429                 }
8430             }
8431
8432           /* -------- vectorize statement ------------ */
8433           if (dump_enabled_p ())
8434             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8435
8436           grouped_store = false;
8437           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8438           if (is_store)
8439             {
8440               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8441                 {
8442                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8443                      interleaving chain was completed - free all the stores in
8444                      the chain.  */
8445                   gsi_next (&si);
8446                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8447                 }
8448               else
8449                 {
8450                   /* Free the attached stmt_vec_info and remove the stmt.  */
8451                   gimple *store = gsi_stmt (si);
8452                   free_stmt_vec_info (store);
8453                   unlink_stmt_vdef (store);
8454                   gsi_remove (&si, true);
8455                   release_defs (store);
8456                 }
8457
8458               /* Stores can only appear at the end of pattern statements.  */
8459               gcc_assert (!transform_pattern_stmt);
8460               pattern_def_seq = NULL;
8461             }
8462           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8463             {
8464               pattern_def_seq = NULL;
8465               gsi_next (&si);
8466             }
8467         }                       /* stmts in BB */
8468
8469       /* Stub out scalar statements that must not survive vectorization.
8470          Doing this here helps with grouped statements, or statements that
8471          are involved in patterns.  */
8472       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8473            !gsi_end_p (gsi); gsi_next (&gsi))
8474         {
8475           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8476           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8477             {
8478               tree lhs = gimple_get_lhs (call);
8479               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8480                 {
8481                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8482                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8483                   gsi_replace (&gsi, new_stmt, true);
8484                 }
8485             }
8486         }
8487     }                           /* BBs in loop */
8488
8489   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8490      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8491   if (integer_onep (step_vector))
8492     niters_no_overflow = true;
8493   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8494                            niters_vector_mult_vf, !niters_no_overflow);
8495
8496   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8497   scale_profile_for_vect_loop (loop, assumed_vf);
8498
8499   /* True if the final iteration might not handle a full vector's
8500      worth of scalar iterations.  */
8501   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8502   /* The minimum number of iterations performed by the epilogue.  This
8503      is 1 when peeling for gaps because we always need a final scalar
8504      iteration.  */
8505   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8506   /* +1 to convert latch counts to loop iteration counts,
8507      -min_epilogue_iters to remove iterations that cannot be performed
8508        by the vector code.  */
8509   int bias_for_lowest = 1 - min_epilogue_iters;
8510   int bias_for_assumed = bias_for_lowest;
8511   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8512   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8513     {
8514       /* When the amount of peeling is known at compile time, the first
8515          iteration will have exactly alignment_npeels active elements.
8516          In the worst case it will have at least one.  */
8517       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8518       bias_for_lowest += lowest_vf - min_first_active;
8519       bias_for_assumed += assumed_vf - min_first_active;
8520     }
8521   /* In these calculations the "- 1" converts loop iteration counts
8522      back to latch counts.  */
8523   if (loop->any_upper_bound)
8524     loop->nb_iterations_upper_bound
8525       = (final_iter_may_be_partial
8526          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8527                           lowest_vf) - 1
8528          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8529                            lowest_vf) - 1);
8530   if (loop->any_likely_upper_bound)
8531     loop->nb_iterations_likely_upper_bound
8532       = (final_iter_may_be_partial
8533          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8534                           + bias_for_lowest, lowest_vf) - 1
8535          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8536                            + bias_for_lowest, lowest_vf) - 1);
8537   if (loop->any_estimate)
8538     loop->nb_iterations_estimate
8539       = (final_iter_may_be_partial
8540          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8541                           assumed_vf) - 1
8542          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8543                            assumed_vf) - 1);
8544
8545   if (dump_enabled_p ())
8546     {
8547       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8548         {
8549           dump_printf_loc (MSG_NOTE, vect_location,
8550                            "LOOP VECTORIZED\n");
8551           if (loop->inner)
8552             dump_printf_loc (MSG_NOTE, vect_location,
8553                              "OUTER LOOP VECTORIZED\n");
8554           dump_printf (MSG_NOTE, "\n");
8555         }
8556       else
8557         {
8558           dump_printf_loc (MSG_NOTE, vect_location,
8559                            "LOOP EPILOGUE VECTORIZED (VS=");
8560           dump_dec (MSG_NOTE, current_vector_size);
8561           dump_printf (MSG_NOTE, ")\n");
8562         }
8563     }
8564
8565   /* Free SLP instances here because otherwise stmt reference counting
8566      won't work.  */
8567   slp_instance instance;
8568   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8569     vect_free_slp_instance (instance);
8570   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8571   /* Clear-up safelen field since its value is invalid after vectorization
8572      since vectorized loop can have loop-carried dependencies.  */
8573   loop->safelen = 0;
8574
8575   /* Don't vectorize epilogue for epilogue.  */
8576   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8577     epilogue = NULL;
8578
8579   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8580     epilogue = NULL;
8581
8582   if (epilogue)
8583     {
8584       auto_vector_sizes vector_sizes;
8585       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8586       unsigned int next_size = 0;
8587
8588       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8589           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8590           && known_eq (vf, lowest_vf))
8591         {
8592           unsigned int eiters
8593             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8594                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8595           eiters = eiters % lowest_vf;
8596           epilogue->nb_iterations_upper_bound = eiters - 1;
8597
8598           unsigned int ratio;
8599           while (next_size < vector_sizes.length ()
8600                  && !(constant_multiple_p (current_vector_size,
8601                                            vector_sizes[next_size], &ratio)
8602                       && eiters >= lowest_vf / ratio))
8603             next_size += 1;
8604         }
8605       else
8606         while (next_size < vector_sizes.length ()
8607                && maybe_lt (current_vector_size, vector_sizes[next_size]))
8608           next_size += 1;
8609
8610       if (next_size == vector_sizes.length ())
8611         epilogue = NULL;
8612     }
8613
8614   if (epilogue)
8615     {
8616       epilogue->force_vectorize = loop->force_vectorize;
8617       epilogue->safelen = loop->safelen;
8618       epilogue->dont_vectorize = false;
8619
8620       /* We may need to if-convert epilogue to vectorize it.  */
8621       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8622         tree_if_conversion (epilogue);
8623     }
8624
8625   return epilogue;
8626 }
8627
8628 /* The code below is trying to perform simple optimization - revert
8629    if-conversion for masked stores, i.e. if the mask of a store is zero
8630    do not perform it and all stored value producers also if possible.
8631    For example,
8632      for (i=0; i<n; i++)
8633        if (c[i])
8634         {
8635           p1[i] += 1;
8636           p2[i] = p3[i] +2;
8637         }
8638    this transformation will produce the following semi-hammock:
8639
8640    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8641      {
8642        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8643        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8644        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8645        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8646        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8647        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8648      }
8649 */
8650
8651 void
8652 optimize_mask_stores (struct loop *loop)
8653 {
8654   basic_block *bbs = get_loop_body (loop);
8655   unsigned nbbs = loop->num_nodes;
8656   unsigned i;
8657   basic_block bb;
8658   struct loop *bb_loop;
8659   gimple_stmt_iterator gsi;
8660   gimple *stmt;
8661   auto_vec<gimple *> worklist;
8662
8663   vect_location = find_loop_location (loop);
8664   /* Pick up all masked stores in loop if any.  */
8665   for (i = 0; i < nbbs; i++)
8666     {
8667       bb = bbs[i];
8668       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8669            gsi_next (&gsi))
8670         {
8671           stmt = gsi_stmt (gsi);
8672           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8673             worklist.safe_push (stmt);
8674         }
8675     }
8676
8677   free (bbs);
8678   if (worklist.is_empty ())
8679     return;
8680
8681   /* Loop has masked stores.  */
8682   while (!worklist.is_empty ())
8683     {
8684       gimple *last, *last_store;
8685       edge e, efalse;
8686       tree mask;
8687       basic_block store_bb, join_bb;
8688       gimple_stmt_iterator gsi_to;
8689       tree vdef, new_vdef;
8690       gphi *phi;
8691       tree vectype;
8692       tree zero;
8693
8694       last = worklist.pop ();
8695       mask = gimple_call_arg (last, 2);
8696       bb = gimple_bb (last);
8697       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8698          the same loop as if_bb.  It could be different to LOOP when two
8699          level loop-nest is vectorized and mask_store belongs to the inner
8700          one.  */
8701       e = split_block (bb, last);
8702       bb_loop = bb->loop_father;
8703       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8704       join_bb = e->dest;
8705       store_bb = create_empty_bb (bb);
8706       add_bb_to_loop (store_bb, bb_loop);
8707       e->flags = EDGE_TRUE_VALUE;
8708       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8709       /* Put STORE_BB to likely part.  */
8710       efalse->probability = profile_probability::unlikely ();
8711       store_bb->count = efalse->count ();
8712       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8713       if (dom_info_available_p (CDI_DOMINATORS))
8714         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8715       if (dump_enabled_p ())
8716         dump_printf_loc (MSG_NOTE, vect_location,
8717                          "Create new block %d to sink mask stores.",
8718                          store_bb->index);
8719       /* Create vector comparison with boolean result.  */
8720       vectype = TREE_TYPE (mask);
8721       zero = build_zero_cst (vectype);
8722       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8723       gsi = gsi_last_bb (bb);
8724       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8725       /* Create new PHI node for vdef of the last masked store:
8726          .MEM_2 = VDEF <.MEM_1>
8727          will be converted to
8728          .MEM.3 = VDEF <.MEM_1>
8729          and new PHI node will be created in join bb
8730          .MEM_2 = PHI <.MEM_1, .MEM_3>
8731       */
8732       vdef = gimple_vdef (last);
8733       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8734       gimple_set_vdef (last, new_vdef);
8735       phi = create_phi_node (vdef, join_bb);
8736       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8737
8738       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8739       while (true)
8740         {
8741           gimple_stmt_iterator gsi_from;
8742           gimple *stmt1 = NULL;
8743
8744           /* Move masked store to STORE_BB.  */
8745           last_store = last;
8746           gsi = gsi_for_stmt (last);
8747           gsi_from = gsi;
8748           /* Shift GSI to the previous stmt for further traversal.  */
8749           gsi_prev (&gsi);
8750           gsi_to = gsi_start_bb (store_bb);
8751           gsi_move_before (&gsi_from, &gsi_to);
8752           /* Setup GSI_TO to the non-empty block start.  */
8753           gsi_to = gsi_start_bb (store_bb);
8754           if (dump_enabled_p ())
8755             {
8756               dump_printf_loc (MSG_NOTE, vect_location,
8757                                "Move stmt to created bb\n");
8758               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8759             }
8760           /* Move all stored value producers if possible.  */
8761           while (!gsi_end_p (gsi))
8762             {
8763               tree lhs;
8764               imm_use_iterator imm_iter;
8765               use_operand_p use_p;
8766               bool res;
8767
8768               /* Skip debug statements.  */
8769               if (is_gimple_debug (gsi_stmt (gsi)))
8770                 {
8771                   gsi_prev (&gsi);
8772                   continue;
8773                 }
8774               stmt1 = gsi_stmt (gsi);
8775               /* Do not consider statements writing to memory or having
8776                  volatile operand.  */
8777               if (gimple_vdef (stmt1)
8778                   || gimple_has_volatile_ops (stmt1))
8779                 break;
8780               gsi_from = gsi;
8781               gsi_prev (&gsi);
8782               lhs = gimple_get_lhs (stmt1);
8783               if (!lhs)
8784                 break;
8785
8786               /* LHS of vectorized stmt must be SSA_NAME.  */
8787               if (TREE_CODE (lhs) != SSA_NAME)
8788                 break;
8789
8790               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8791                 {
8792                   /* Remove dead scalar statement.  */
8793                   if (has_zero_uses (lhs))
8794                     {
8795                       gsi_remove (&gsi_from, true);
8796                       continue;
8797                     }
8798                 }
8799
8800               /* Check that LHS does not have uses outside of STORE_BB.  */
8801               res = true;
8802               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8803                 {
8804                   gimple *use_stmt;
8805                   use_stmt = USE_STMT (use_p);
8806                   if (is_gimple_debug (use_stmt))
8807                     continue;
8808                   if (gimple_bb (use_stmt) != store_bb)
8809                     {
8810                       res = false;
8811                       break;
8812                     }
8813                 }
8814               if (!res)
8815                 break;
8816
8817               if (gimple_vuse (stmt1)
8818                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
8819                 break;
8820
8821               /* Can move STMT1 to STORE_BB.  */
8822               if (dump_enabled_p ())
8823                 {
8824                   dump_printf_loc (MSG_NOTE, vect_location,
8825                                    "Move stmt to created bb\n");
8826                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8827                 }
8828               gsi_move_before (&gsi_from, &gsi_to);
8829               /* Shift GSI_TO for further insertion.  */
8830               gsi_prev (&gsi_to);
8831             }
8832           /* Put other masked stores with the same mask to STORE_BB.  */
8833           if (worklist.is_empty ()
8834               || gimple_call_arg (worklist.last (), 2) != mask
8835               || worklist.last () != stmt1)
8836             break;
8837           last = worklist.pop ();
8838         }
8839       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8840     }
8841 }