gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56
  57 /* Loop Vectorization Pass.
  58
  59    This pass tries to vectorize loops.
  60
  61    For example, the vectorizer transforms the following simple loop:
  62
  63         short a[N]; short b[N]; short c[N]; int i;
  64
  65         for (i=0; i<N; i++){
  66           a[i] = b[i] + c[i];
  67         }
  68
  69    as if it was manually vectorized by rewriting the source code into:
  70
  71         typedef int __attribute__((mode(V8HI))) v8hi;
  72         short a[N];  short b[N]; short c[N];   int i;
  73         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  74         v8hi va, vb, vc;
  75
  76         for (i=0; i<N/8; i++){
  77           vb = pb[i];
  78           vc = pc[i];
  79           va = vb + vc;
  80           pa[i] = va;
  81         }
  82
  83         The main entry to this pass is vectorize_loops(), in which
  84    the vectorizer applies a set of analyses on a given set of loops,
  85    followed by the actual vectorization transformation for the loops that
  86    had successfully passed the analysis phase.
  87         Throughout this pass we make a distinction between two types of
  88    data: scalars (which are represented by SSA_NAMES), and memory references
  89    ("data-refs").  These two types of data require different handling both
  90    during analysis and transformation. The types of data-refs that the
  91    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  92    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  93    accesses are required to have a simple (consecutive) access pattern.
  94
  95    Analysis phase:
  96    ===============
  97         The driver for the analysis phase is vect_analyze_loop().
  98    It applies a set of analyses, some of which rely on the scalar evolution
  99    analyzer (scev) developed by Sebastian Pop.
 100
 101         During the analysis phase the vectorizer records some information
 102    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 103    loop, as well as general information about the loop as a whole, which is
 104    recorded in a "loop_vec_info" struct attached to each loop.
 105
 106    Transformation phase:
 107    =====================
 108         The loop transformation phase scans all the stmts in the loop, and
 109    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 110    the loop that needs to be vectorized.  It inserts the vector code sequence
 111    just before the scalar stmt S, and records a pointer to the vector code
 112    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 113    attached to S).  This pointer will be used for the vectorization of following
 114    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 115    otherwise, we rely on dead code elimination for removing it.
 116
 117         For example, say stmt S1 was vectorized into stmt VS1:
 118
 119    VS1: vb = px[i];
 120    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 121    S2:  a = b;
 122
 123    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 124    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 125    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 126    resulting sequence would be:
 127
 128    VS1: vb = px[i];
 129    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 130    VS2: va = vb;
 131    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 132
 133         Operands that are not SSA_NAMEs, are data-refs that appear in
 134    load/store operations (like 'x[i]' in S1), and are handled differently.
 135
 136    Target modeling:
 137    =================
 138         Currently the only target specific information that is used is the
 139    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 140    Targets that can support different sizes of vectors, for now will need
 141    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 142    flexibility will be added in the future.
 143
 144         Since we only vectorize operations which vector form can be
 145    expressed using existing tree codes, to verify that an operation is
 146    supported, the vectorizer checks the relevant optab at the relevant
 147    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 148    the value found is CODE_FOR_nothing, then there's no target support, and
 149    we can't vectorize the stmt.
 150
 151    For additional information on this project see:
 152    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 153 */
 154
 155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 156
 157 /* Function vect_determine_vectorization_factor
 158
 159    Determine the vectorization factor (VF).  VF is the number of data elements
 160    that are operated upon in parallel in a single iteration of the vectorized
 161    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 162    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 163    elements can fit in a single vector register.
 164
 165    We currently support vectorization of loops in which all types operated upon
 166    are of the same size.  Therefore this function currently sets VF according to
 167    the size of the types operated upon, and fails if there are multiple sizes
 168    in the loop.
 169
 170    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 171    original loop:
 172         for (i=0; i<N; i++){
 173           a[i] = b[i] + c[i];
 174         }
 175
 176    vectorized loop:
 177         for (i=0; i<N; i+=VF){
 178           a[i:VF] = b[i:VF] + c[i:VF];
 179         }
 180 */
 181
 182 static bool
 183 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 184 {
 185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 187   unsigned nbbs = loop->num_nodes;
 188   poly_uint64 vectorization_factor = 1;
 189   tree scalar_type = NULL_TREE;
 190   gphi *phi;
 191   tree vectype;
 192   stmt_vec_info stmt_info;
 193   unsigned i;
 194   HOST_WIDE_INT dummy;
 195   gimple *stmt, *pattern_stmt = NULL;
 196   gimple_seq pattern_def_seq = NULL;
 197   gimple_stmt_iterator pattern_def_si = gsi_none ();
 198   bool analyze_pattern_stmt = false;
 199   bool bool_result;
 200   auto_vec<stmt_vec_info> mask_producers;
 201
 202   if (dump_enabled_p ())
 203     dump_printf_loc (MSG_NOTE, vect_location,
 204                      "=== vect_determine_vectorization_factor ===\n");
 205
 206   for (i = 0; i < nbbs; i++)
 207     {
 208       basic_block bb = bbs[i];
 209
 210       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 211            gsi_next (&si))
 212         {
 213           phi = si.phi ();
 214           stmt_info = vinfo_for_stmt (phi);
 215           if (dump_enabled_p ())
 216             {
 217               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 218               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 219             }
 220
 221           gcc_assert (stmt_info);
 222
 223           if (STMT_VINFO_RELEVANT_P (stmt_info)
 224               || STMT_VINFO_LIVE_P (stmt_info))
 225             {
 226               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 227               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 228
 229               if (dump_enabled_p ())
 230                 {
 231                   dump_printf_loc (MSG_NOTE, vect_location,
 232                                    "get vectype for scalar type:  ");
 233                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 234                   dump_printf (MSG_NOTE, "\n");
 235                 }
 236
 237               vectype = get_vectype_for_scalar_type (scalar_type);
 238               if (!vectype)
 239                 {
 240                   if (dump_enabled_p ())
 241                     {
 242                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 243                                        "not vectorized: unsupported "
 244                                        "data-type ");
 245                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 246                                          scalar_type);
 247                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 248                     }
 249                   return false;
 250                 }
 251               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 252
 253               if (dump_enabled_p ())
 254                 {
 255                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 256                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 257                   dump_printf (MSG_NOTE, "\n");
 258                 }
 259
 260               if (dump_enabled_p ())
 261                 {
 262                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 263                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 264                   dump_printf (MSG_NOTE, "\n");
 265                 }
 266
 267               vect_update_max_nunits (&vectorization_factor, vectype);
 268             }
 269         }
 270
 271       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 272            !gsi_end_p (si) || analyze_pattern_stmt;)
 273         {
 274           tree vf_vectype;
 275
 276           if (analyze_pattern_stmt)
 277             stmt = pattern_stmt;
 278           else
 279             stmt = gsi_stmt (si);
 280
 281           stmt_info = vinfo_for_stmt (stmt);
 282
 283           if (dump_enabled_p ())
 284             {
 285               dump_printf_loc (MSG_NOTE, vect_location,
 286                                "==> examining statement: ");
 287               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 288             }
 289
 290           gcc_assert (stmt_info);
 291
 292           /* Skip stmts which do not need to be vectorized.  */
 293           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 294                && !STMT_VINFO_LIVE_P (stmt_info))
 295               || gimple_clobber_p (stmt))
 296             {
 297               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 298                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 299                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 300                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 301                 {
 302                   stmt = pattern_stmt;
 303                   stmt_info = vinfo_for_stmt (pattern_stmt);
 304                   if (dump_enabled_p ())
 305                     {
 306                       dump_printf_loc (MSG_NOTE, vect_location,
 307                                        "==> examining pattern statement: ");
 308                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 309                     }
 310                 }
 311               else
 312                 {
 313                   if (dump_enabled_p ())
 314                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 315                   gsi_next (&si);
 316                   continue;
 317                 }
 318             }
 319           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 320                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 321                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 322                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 323             analyze_pattern_stmt = true;
 324
 325           /* If a pattern statement has def stmts, analyze them too.  */
 326           if (is_pattern_stmt_p (stmt_info))
 327             {
 328               if (pattern_def_seq == NULL)
 329                 {
 330                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 331                   pattern_def_si = gsi_start (pattern_def_seq);
 332                 }
 333               else if (!gsi_end_p (pattern_def_si))
 334                 gsi_next (&pattern_def_si);
 335               if (pattern_def_seq != NULL)
 336                 {
 337                   gimple *pattern_def_stmt = NULL;
 338                   stmt_vec_info pattern_def_stmt_info = NULL;
 339
 340                   while (!gsi_end_p (pattern_def_si))
 341                     {
 342                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 343                       pattern_def_stmt_info
 344                         = vinfo_for_stmt (pattern_def_stmt);
 345                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 346                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 347                         break;
 348                       gsi_next (&pattern_def_si);
 349                     }
 350
 351                   if (!gsi_end_p (pattern_def_si))
 352                     {
 353                       if (dump_enabled_p ())
 354                         {
 355                           dump_printf_loc (MSG_NOTE, vect_location,
 356                                            "==> examining pattern def stmt: ");
 357                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 358                                             pattern_def_stmt, 0);
 359                         }
 360
 361                       stmt = pattern_def_stmt;
 362                       stmt_info = pattern_def_stmt_info;
 363                     }
 364                   else
 365                     {
 366                       pattern_def_si = gsi_none ();
 367                       analyze_pattern_stmt = false;
 368                     }
 369                 }
 370               else
 371                 analyze_pattern_stmt = false;
 372             }
 373
 374           if (gimple_get_lhs (stmt) == NULL_TREE
 375               /* MASK_STORE has no lhs, but is ok.  */
 376               && (!is_gimple_call (stmt)
 377                   || !gimple_call_internal_p (stmt)
 378                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 379             {
 380               if (is_gimple_call (stmt))
 381                 {
 382                   /* Ignore calls with no lhs.  These must be calls to
 383                      #pragma omp simd functions, and what vectorization factor
 384                      it really needs can't be determined until
 385                      vectorizable_simd_clone_call.  */
 386                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 387                     {
 388                       pattern_def_seq = NULL;
 389                       gsi_next (&si);
 390                     }
 391                   continue;
 392                 }
 393               if (dump_enabled_p ())
 394                 {
 395                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 396                                    "not vectorized: irregular stmt.");
 397                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 398                                     0);
 399                 }
 400               return false;
 401             }
 402
 403           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 404             {
 405               if (dump_enabled_p ())
 406                 {
 407                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 408                                    "not vectorized: vector stmt in loop:");
 409                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 410                 }
 411               return false;
 412             }
 413
 414           bool_result = false;
 415
 416           if (STMT_VINFO_VECTYPE (stmt_info))
 417             {
 418               /* The only case when a vectype had been already set is for stmts
 419                  that contain a dataref, or for "pattern-stmts" (stmts
 420                  generated by the vectorizer to represent/replace a certain
 421                  idiom).  */
 422               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 423                           || is_pattern_stmt_p (stmt_info)
 424                           || !gsi_end_p (pattern_def_si));
 425               vectype = STMT_VINFO_VECTYPE (stmt_info);
 426             }
 427           else
 428             {
 429               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 430               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 431                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 432               else
 433                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 434
 435               /* Bool ops don't participate in vectorization factor
 436                  computation.  For comparison use compared types to
 437                  compute a factor.  */
 438               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 439                   && is_gimple_assign (stmt)
 440                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 441                 {
 442                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 443                       || STMT_VINFO_LIVE_P (stmt_info))
 444                     mask_producers.safe_push (stmt_info);
 445                   bool_result = true;
 446
 447                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 448                       == tcc_comparison
 449                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 450                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 451                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 452                   else
 453                     {
 454                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 455                         {
 456                           pattern_def_seq = NULL;
 457                           gsi_next (&si);
 458                         }
 459                       continue;
 460                     }
 461                 }
 462
 463               if (dump_enabled_p ())
 464                 {
 465                   dump_printf_loc (MSG_NOTE, vect_location,
 466                                    "get vectype for scalar type:  ");
 467                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 468                   dump_printf (MSG_NOTE, "\n");
 469                 }
 470               vectype = get_vectype_for_scalar_type (scalar_type);
 471               if (!vectype)
 472                 {
 473                   if (dump_enabled_p ())
 474                     {
 475                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 476                                        "not vectorized: unsupported "
 477                                        "data-type ");
 478                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 479                                          scalar_type);
 480                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 481                     }
 482                   return false;
 483                 }
 484
 485               if (!bool_result)
 486                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 487
 488               if (dump_enabled_p ())
 489                 {
 490                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 491                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 492                   dump_printf (MSG_NOTE, "\n");
 493                 }
 494             }
 495
 496           /* Don't try to compute VF out scalar types if we stmt
 497              produces boolean vector.  Use result vectype instead.  */
 498           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 499             vf_vectype = vectype;
 500           else
 501             {
 502               /* The vectorization factor is according to the smallest
 503                  scalar type (or the largest vector size, but we only
 504                  support one vector size per loop).  */
 505               if (!bool_result)
 506                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 507                                                              &dummy);
 508               if (dump_enabled_p ())
 509                 {
 510                   dump_printf_loc (MSG_NOTE, vect_location,
 511                                    "get vectype for scalar type:  ");
 512                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 513                   dump_printf (MSG_NOTE, "\n");
 514                 }
 515               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 516             }
 517           if (!vf_vectype)
 518             {
 519               if (dump_enabled_p ())
 520                 {
 521                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 522                                    "not vectorized: unsupported data-type ");
 523                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 524                                      scalar_type);
 525                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 526                 }
 527               return false;
 528             }
 529
 530           if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
 531                         GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 532             {
 533               if (dump_enabled_p ())
 534                 {
 535                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 536                                    "not vectorized: different sized vector "
 537                                    "types in statement, ");
 538                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 539                                      vectype);
 540                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 541                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 542                                      vf_vectype);
 543                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 544                 }
 545               return false;
 546             }
 547
 548           if (dump_enabled_p ())
 549             {
 550               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 551               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 552               dump_printf (MSG_NOTE, "\n");
 553             }
 554
 555           if (dump_enabled_p ())
 556             {
 557               dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 558               dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
 559               dump_printf (MSG_NOTE, "\n");
 560             }
 561
 562           vect_update_max_nunits (&vectorization_factor, vf_vectype);
 563
 564           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 565             {
 566               pattern_def_seq = NULL;
 567               gsi_next (&si);
 568             }
 569         }
 570     }
 571
 572   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 573   if (dump_enabled_p ())
 574     {
 575       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 576       dump_dec (MSG_NOTE, vectorization_factor);
 577       dump_printf (MSG_NOTE, "\n");
 578     }
 579
 580   if (known_le (vectorization_factor, 1U))
 581     {
 582       if (dump_enabled_p ())
 583         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 584                          "not vectorized: unsupported data-type\n");
 585       return false;
 586     }
 587   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 588
 589   for (i = 0; i < mask_producers.length (); i++)
 590     {
 591       tree mask_type = NULL;
 592
 593       stmt = STMT_VINFO_STMT (mask_producers[i]);
 594
 595       if (is_gimple_assign (stmt)
 596           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 597           && !VECT_SCALAR_BOOLEAN_TYPE_P
 598                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 599         {
 600           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 601           mask_type = get_mask_type_for_scalar_type (scalar_type);
 602
 603           if (!mask_type)
 604             {
 605               if (dump_enabled_p ())
 606                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 607                                  "not vectorized: unsupported mask\n");
 608               return false;
 609             }
 610         }
 611       else
 612         {
 613           tree rhs;
 614           ssa_op_iter iter;
 615           gimple *def_stmt;
 616           enum vect_def_type dt;
 617
 618           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 619             {
 620               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 621                                        &def_stmt, &dt, &vectype))
 622                 {
 623                   if (dump_enabled_p ())
 624                     {
 625                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 626                                        "not vectorized: can't compute mask type "
 627                                        "for statement, ");
 628                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 629                                         0);
 630                     }
 631                   return false;
 632                 }
 633
 634               /* No vectype probably means external definition.
 635                  Allow it in case there is another operand which
 636                  allows to determine mask type.  */
 637               if (!vectype)
 638                 continue;
 639
 640               if (!mask_type)
 641                 mask_type = vectype;
 642               else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
 643                                  TYPE_VECTOR_SUBPARTS (vectype)))
 644                 {
 645                   if (dump_enabled_p ())
 646                     {
 647                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 648                                        "not vectorized: different sized masks "
 649                                        "types in statement, ");
 650                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 651                                          mask_type);
 652                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 653                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 654                                          vectype);
 655                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 656                     }
 657                   return false;
 658                 }
 659               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 660                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 661                 {
 662                   if (dump_enabled_p ())
 663                     {
 664                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 665                                        "not vectorized: mixed mask and "
 666                                        "nonmask vector types in statement, ");
 667                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 668                                          mask_type);
 669                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 670                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 671                                          vectype);
 672                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 673                     }
 674                   return false;
 675                 }
 676             }
 677
 678           /* We may compare boolean value loaded as vector of integers.
 679              Fix mask_type in such case.  */
 680           if (mask_type
 681               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 682               && gimple_code (stmt) == GIMPLE_ASSIGN
 683               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 684             mask_type = build_same_sized_truth_vector_type (mask_type);
 685         }
 686
 687       /* No mask_type should mean loop invariant predicate.
 688          This is probably a subject for optimization in
 689          if-conversion.  */
 690       if (!mask_type)
 691         {
 692           if (dump_enabled_p ())
 693             {
 694               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 695                                "not vectorized: can't compute mask type "
 696                                "for statement, ");
 697               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 698                                 0);
 699             }
 700           return false;
 701         }
 702
 703       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 704     }
 705
 706   return true;
 707 }
 708
 709
 710 /* Function vect_is_simple_iv_evolution.
 711
 712    FORNOW: A simple evolution of an induction variables in the loop is
 713    considered a polynomial evolution.  */
 714
 715 static bool
 716 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 717                              tree * step)
 718 {
 719   tree init_expr;
 720   tree step_expr;
 721   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 722   basic_block bb;
 723
 724   /* When there is no evolution in this loop, the evolution function
 725      is not "simple".  */
 726   if (evolution_part == NULL_TREE)
 727     return false;
 728
 729   /* When the evolution is a polynomial of degree >= 2
 730      the evolution function is not "simple".  */
 731   if (tree_is_chrec (evolution_part))
 732     return false;
 733
 734   step_expr = evolution_part;
 735   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 736
 737   if (dump_enabled_p ())
 738     {
 739       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 740       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 741       dump_printf (MSG_NOTE, ",  init: ");
 742       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 743       dump_printf (MSG_NOTE, "\n");
 744     }
 745
 746   *init = init_expr;
 747   *step = step_expr;
 748
 749   if (TREE_CODE (step_expr) != INTEGER_CST
 750       && (TREE_CODE (step_expr) != SSA_NAME
 751           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 752               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 753           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 754               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 755                   || !flag_associative_math)))
 756       && (TREE_CODE (step_expr) != REAL_CST
 757           || !flag_associative_math))
 758     {
 759       if (dump_enabled_p ())
 760         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 761                          "step unknown.\n");
 762       return false;
 763     }
 764
 765   return true;
 766 }
 767
 768 /* Function vect_analyze_scalar_cycles_1.
 769
 770    Examine the cross iteration def-use cycles of scalar variables
 771    in LOOP.  LOOP_VINFO represents the loop that is now being
 772    considered for vectorization (can be LOOP, or an outer-loop
 773    enclosing LOOP).  */
 774
 775 static void
 776 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 777 {
 778   basic_block bb = loop->header;
 779   tree init, step;
 780   auto_vec<gimple *, 64> worklist;
 781   gphi_iterator gsi;
 782   bool double_reduc;
 783
 784   if (dump_enabled_p ())
 785     dump_printf_loc (MSG_NOTE, vect_location,
 786                      "=== vect_analyze_scalar_cycles ===\n");
 787
 788   /* First - identify all inductions.  Reduction detection assumes that all the
 789      inductions have been identified, therefore, this order must not be
 790      changed.  */
 791   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 792     {
 793       gphi *phi = gsi.phi ();
 794       tree access_fn = NULL;
 795       tree def = PHI_RESULT (phi);
 796       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 797
 798       if (dump_enabled_p ())
 799         {
 800           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 801           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 802         }
 803
 804       /* Skip virtual phi's.  The data dependences that are associated with
 805          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 806       if (virtual_operand_p (def))
 807         continue;
 808
 809       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 810
 811       /* Analyze the evolution function.  */
 812       access_fn = analyze_scalar_evolution (loop, def);
 813       if (access_fn)
 814         {
 815           STRIP_NOPS (access_fn);
 816           if (dump_enabled_p ())
 817             {
 818               dump_printf_loc (MSG_NOTE, vect_location,
 819                                "Access function of PHI: ");
 820               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 821               dump_printf (MSG_NOTE, "\n");
 822             }
 823           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 824             = initial_condition_in_loop_num (access_fn, loop->num);
 825           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 826             = evolution_part_in_loop_num (access_fn, loop->num);
 827         }
 828
 829       if (!access_fn
 830           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 831           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 832               && TREE_CODE (step) != INTEGER_CST))
 833         {
 834           worklist.safe_push (phi);
 835           continue;
 836         }
 837
 838       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 839                   != NULL_TREE);
 840       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 841
 842       if (dump_enabled_p ())
 843         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 844       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 845     }
 846
 847
 848   /* Second - identify all reductions and nested cycles.  */
 849   while (worklist.length () > 0)
 850     {
 851       gimple *phi = worklist.pop ();
 852       tree def = PHI_RESULT (phi);
 853       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 854       gimple *reduc_stmt;
 855
 856       if (dump_enabled_p ())
 857         {
 858           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 859           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 860         }
 861
 862       gcc_assert (!virtual_operand_p (def)
 863                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 864
 865       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 866                                                 &double_reduc, false);
 867       if (reduc_stmt)
 868         {
 869           if (double_reduc)
 870             {
 871               if (dump_enabled_p ())
 872                 dump_printf_loc (MSG_NOTE, vect_location,
 873                                  "Detected double reduction.\n");
 874
 875               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 876               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 877                                                     vect_double_reduction_def;
 878             }
 879           else
 880             {
 881               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 882                 {
 883                   if (dump_enabled_p ())
 884                     dump_printf_loc (MSG_NOTE, vect_location,
 885                                      "Detected vectorizable nested cycle.\n");
 886
 887                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 888                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 889                                                              vect_nested_cycle;
 890                 }
 891               else
 892                 {
 893                   if (dump_enabled_p ())
 894                     dump_printf_loc (MSG_NOTE, vect_location,
 895                                      "Detected reduction.\n");
 896
 897                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 898                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 899                                                            vect_reduction_def;
 900                   /* Store the reduction cycles for possible vectorization in
 901                      loop-aware SLP if it was not detected as reduction
 902                      chain.  */
 903                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 904                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 905                 }
 906             }
 907         }
 908       else
 909         if (dump_enabled_p ())
 910           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 911                            "Unknown def-use cycle pattern.\n");
 912     }
 913 }
 914
 915
 916 /* Function vect_analyze_scalar_cycles.
 917
 918    Examine the cross iteration def-use cycles of scalar variables, by
 919    analyzing the loop-header PHIs of scalar variables.  Classify each
 920    cycle as one of the following: invariant, induction, reduction, unknown.
 921    We do that for the loop represented by LOOP_VINFO, and also to its
 922    inner-loop, if exists.
 923    Examples for scalar cycles:
 924
 925    Example1: reduction:
 926
 927               loop1:
 928               for (i=0; i<N; i++)
 929                  sum += a[i];
 930
 931    Example2: induction:
 932
 933               loop2:
 934               for (i=0; i<N; i++)
 935                  a[i] = i;  */
 936
 937 static void
 938 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 939 {
 940   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 941
 942   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 943
 944   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 945      Reductions in such inner-loop therefore have different properties than
 946      the reductions in the nest that gets vectorized:
 947      1. When vectorized, they are executed in the same order as in the original
 948         scalar loop, so we can't change the order of computation when
 949         vectorizing them.
 950      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 951         current checks are too strict.  */
 952
 953   if (loop->inner)
 954     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 955 }
 956
 957 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 958
 959 static void
 960 vect_fixup_reduc_chain (gimple *stmt)
 961 {
 962   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 963   gimple *stmtp;
 964   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 965               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 966   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 967   do
 968     {
 969       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 970       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 971       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 972       if (stmt)
 973         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 974           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 975     }
 976   while (stmt);
 977   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 978 }
 979
 980 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 981
 982 static void
 983 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 984 {
 985   gimple *first;
 986   unsigned i;
 987
 988   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 989     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 990       {
 991         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 992         while (next)
 993           {
 994             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 995               break;
 996             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 997           }
 998         /* If not all stmt in the chain are patterns try to handle
 999            the chain without patterns.  */
1000         if (! next)
1001           {
1002             vect_fixup_reduc_chain (first);
1003             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1004               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1005           }
1006       }
1007 }
1008
1009 /* Function vect_get_loop_niters.
1010
1011    Determine how many iterations the loop is executed and place it
1012    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1013    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1014    niter information holds in ASSUMPTIONS.
1015
1016    Return the loop exit condition.  */
1017
1018
1019 static gcond *
1020 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1021                       tree *number_of_iterations, tree *number_of_iterationsm1)
1022 {
1023   edge exit = single_exit (loop);
1024   struct tree_niter_desc niter_desc;
1025   tree niter_assumptions, niter, may_be_zero;
1026   gcond *cond = get_loop_exit_condition (loop);
1027
1028   *assumptions = boolean_true_node;
1029   *number_of_iterationsm1 = chrec_dont_know;
1030   *number_of_iterations = chrec_dont_know;
1031   if (dump_enabled_p ())
1032     dump_printf_loc (MSG_NOTE, vect_location,
1033                      "=== get_loop_niters ===\n");
1034
1035   if (!exit)
1036     return cond;
1037
1038   niter = chrec_dont_know;
1039   may_be_zero = NULL_TREE;
1040   niter_assumptions = boolean_true_node;
1041   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1042       || chrec_contains_undetermined (niter_desc.niter))
1043     return cond;
1044
1045   niter_assumptions = niter_desc.assumptions;
1046   may_be_zero = niter_desc.may_be_zero;
1047   niter = niter_desc.niter;
1048
1049   if (may_be_zero && integer_zerop (may_be_zero))
1050     may_be_zero = NULL_TREE;
1051
1052   if (may_be_zero)
1053     {
1054       if (COMPARISON_CLASS_P (may_be_zero))
1055         {
1056           /* Try to combine may_be_zero with assumptions, this can simplify
1057              computation of niter expression.  */
1058           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1059             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1060                                              niter_assumptions,
1061                                              fold_build1 (TRUTH_NOT_EXPR,
1062                                                           boolean_type_node,
1063                                                           may_be_zero));
1064           else
1065             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1066                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1067
1068           may_be_zero = NULL_TREE;
1069         }
1070       else if (integer_nonzerop (may_be_zero))
1071         {
1072           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1073           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1074           return cond;
1075         }
1076       else
1077         return cond;
1078     }
1079
1080   *assumptions = niter_assumptions;
1081   *number_of_iterationsm1 = niter;
1082
1083   /* We want the number of loop header executions which is the number
1084      of latch executions plus one.
1085      ???  For UINT_MAX latch executions this number overflows to zero
1086      for loops like do { n++; } while (n != 0);  */
1087   if (niter && !chrec_contains_undetermined (niter))
1088     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1089                           build_int_cst (TREE_TYPE (niter), 1));
1090   *number_of_iterations = niter;
1091
1092   return cond;
1093 }
1094
1095 /* Function bb_in_loop_p
1096
1097    Used as predicate for dfs order traversal of the loop bbs.  */
1098
1099 static bool
1100 bb_in_loop_p (const_basic_block bb, const void *data)
1101 {
1102   const struct loop *const loop = (const struct loop *)data;
1103   if (flow_bb_inside_loop_p (loop, bb))
1104     return true;
1105   return false;
1106 }
1107
1108
1109 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1110    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1111
1112 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1113   : vec_info (vec_info::loop, init_cost (loop_in)),
1114     loop (loop_in),
1115     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1116     num_itersm1 (NULL_TREE),
1117     num_iters (NULL_TREE),
1118     num_iters_unchanged (NULL_TREE),
1119     num_iters_assumptions (NULL_TREE),
1120     th (0),
1121     versioning_threshold (0),
1122     vectorization_factor (0),
1123     max_vectorization_factor (0),
1124     unaligned_dr (NULL),
1125     peeling_for_alignment (0),
1126     ptr_mask (0),
1127     slp_unrolling_factor (1),
1128     single_scalar_iteration_cost (0),
1129     vectorizable (false),
1130     peeling_for_gaps (false),
1131     peeling_for_niter (false),
1132     operands_swapped (false),
1133     no_data_dependencies (false),
1134     has_mask_store (false),
1135     scalar_loop (NULL),
1136     orig_loop_info (NULL)
1137 {
1138   /* Create/Update stmt_info for all stmts in the loop.  */
1139   basic_block *body = get_loop_body (loop);
1140   for (unsigned int i = 0; i < loop->num_nodes; i++)
1141     {
1142       basic_block bb = body[i];
1143       gimple_stmt_iterator si;
1144
1145       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1146         {
1147           gimple *phi = gsi_stmt (si);
1148           gimple_set_uid (phi, 0);
1149           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1150         }
1151
1152       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1153         {
1154           gimple *stmt = gsi_stmt (si);
1155           gimple_set_uid (stmt, 0);
1156           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1157         }
1158     }
1159   free (body);
1160
1161   /* CHECKME: We want to visit all BBs before their successors (except for
1162      latch blocks, for which this assertion wouldn't hold).  In the simple
1163      case of the loop forms we allow, a dfs order of the BBs would the same
1164      as reversed postorder traversal, so we are safe.  */
1165
1166   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1167                                           bbs, loop->num_nodes, loop);
1168   gcc_assert (nbbs == loop->num_nodes);
1169 }
1170
1171
1172 /* Free all memory used by the _loop_vec_info, as well as all the
1173    stmt_vec_info structs of all the stmts in the loop.  */
1174
1175 _loop_vec_info::~_loop_vec_info ()
1176 {
1177   int nbbs;
1178   gimple_stmt_iterator si;
1179   int j;
1180
1181   nbbs = loop->num_nodes;
1182   for (j = 0; j < nbbs; j++)
1183     {
1184       basic_block bb = bbs[j];
1185       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1186         free_stmt_vec_info (gsi_stmt (si));
1187
1188       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1189         {
1190           gimple *stmt = gsi_stmt (si);
1191
1192           /* We may have broken canonical form by moving a constant
1193              into RHS1 of a commutative op.  Fix such occurrences.  */
1194           if (operands_swapped && is_gimple_assign (stmt))
1195             {
1196               enum tree_code code = gimple_assign_rhs_code (stmt);
1197
1198               if ((code == PLUS_EXPR
1199                    || code == POINTER_PLUS_EXPR
1200                    || code == MULT_EXPR)
1201                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1202                 swap_ssa_operands (stmt,
1203                                    gimple_assign_rhs1_ptr (stmt),
1204                                    gimple_assign_rhs2_ptr (stmt));
1205               else if (code == COND_EXPR
1206                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1207                 {
1208                   tree cond_expr = gimple_assign_rhs1 (stmt);
1209                   enum tree_code cond_code = TREE_CODE (cond_expr);
1210
1211                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1212                     {
1213                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1214                                                                   0));
1215                       cond_code = invert_tree_comparison (cond_code,
1216                                                           honor_nans);
1217                       if (cond_code != ERROR_MARK)
1218                         {
1219                           TREE_SET_CODE (cond_expr, cond_code);
1220                           swap_ssa_operands (stmt,
1221                                              gimple_assign_rhs2_ptr (stmt),
1222                                              gimple_assign_rhs3_ptr (stmt));
1223                         }
1224                     }
1225                 }
1226             }
1227
1228           /* Free stmt_vec_info.  */
1229           free_stmt_vec_info (stmt);
1230           gsi_next (&si);
1231         }
1232     }
1233
1234   free (bbs);
1235
1236   loop->aux = NULL;
1237 }
1238
1239
1240 /* Calculate the cost of one scalar iteration of the loop.  */
1241 static void
1242 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1243 {
1244   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1245   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1246   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1247   int innerloop_iters, i;
1248
1249   /* Count statements in scalar loop.  Using this as scalar cost for a single
1250      iteration for now.
1251
1252      TODO: Add outer loop support.
1253
1254      TODO: Consider assigning different costs to different scalar
1255      statements.  */
1256
1257   /* FORNOW.  */
1258   innerloop_iters = 1;
1259   if (loop->inner)
1260     innerloop_iters = 50; /* FIXME */
1261
1262   for (i = 0; i < nbbs; i++)
1263     {
1264       gimple_stmt_iterator si;
1265       basic_block bb = bbs[i];
1266
1267       if (bb->loop_father == loop->inner)
1268         factor = innerloop_iters;
1269       else
1270         factor = 1;
1271
1272       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1273         {
1274           gimple *stmt = gsi_stmt (si);
1275           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1276
1277           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1278             continue;
1279
1280           /* Skip stmts that are not vectorized inside the loop.  */
1281           if (stmt_info
1282               && !STMT_VINFO_RELEVANT_P (stmt_info)
1283               && (!STMT_VINFO_LIVE_P (stmt_info)
1284                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1285               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1286             continue;
1287
1288           vect_cost_for_stmt kind;
1289           if (STMT_VINFO_DATA_REF (stmt_info))
1290             {
1291               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1292                kind = scalar_load;
1293              else
1294                kind = scalar_store;
1295             }
1296           else
1297             kind = scalar_stmt;
1298
1299           scalar_single_iter_cost
1300             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1301                                  factor, kind, stmt_info, 0, vect_prologue);
1302         }
1303     }
1304   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1305     = scalar_single_iter_cost;
1306 }
1307
1308
1309 /* Function vect_analyze_loop_form_1.
1310
1311    Verify that certain CFG restrictions hold, including:
1312    - the loop has a pre-header
1313    - the loop has a single entry and exit
1314    - the loop exit condition is simple enough
1315    - the number of iterations can be analyzed, i.e, a countable loop.  The
1316      niter could be analyzed under some assumptions.  */
1317
1318 bool
1319 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1320                           tree *assumptions, tree *number_of_iterationsm1,
1321                           tree *number_of_iterations, gcond **inner_loop_cond)
1322 {
1323   if (dump_enabled_p ())
1324     dump_printf_loc (MSG_NOTE, vect_location,
1325                      "=== vect_analyze_loop_form ===\n");
1326
1327   /* Different restrictions apply when we are considering an inner-most loop,
1328      vs. an outer (nested) loop.
1329      (FORNOW. May want to relax some of these restrictions in the future).  */
1330
1331   if (!loop->inner)
1332     {
1333       /* Inner-most loop.  We currently require that the number of BBs is
1334          exactly 2 (the header and latch).  Vectorizable inner-most loops
1335          look like this:
1336
1337                         (pre-header)
1338                            |
1339                           header <--------+
1340                            | |            |
1341                            | +--> latch --+
1342                            |
1343                         (exit-bb)  */
1344
1345       if (loop->num_nodes != 2)
1346         {
1347           if (dump_enabled_p ())
1348             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1349                              "not vectorized: control flow in loop.\n");
1350           return false;
1351         }
1352
1353       if (empty_block_p (loop->header))
1354         {
1355           if (dump_enabled_p ())
1356             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1357                              "not vectorized: empty loop.\n");
1358           return false;
1359         }
1360     }
1361   else
1362     {
1363       struct loop *innerloop = loop->inner;
1364       edge entryedge;
1365
1366       /* Nested loop. We currently require that the loop is doubly-nested,
1367          contains a single inner loop, and the number of BBs is exactly 5.
1368          Vectorizable outer-loops look like this:
1369
1370                         (pre-header)
1371                            |
1372                           header <---+
1373                            |         |
1374                           inner-loop |
1375                            |         |
1376                           tail ------+
1377                            |
1378                         (exit-bb)
1379
1380          The inner-loop has the properties expected of inner-most loops
1381          as described above.  */
1382
1383       if ((loop->inner)->inner || (loop->inner)->next)
1384         {
1385           if (dump_enabled_p ())
1386             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1387                              "not vectorized: multiple nested loops.\n");
1388           return false;
1389         }
1390
1391       if (loop->num_nodes != 5)
1392         {
1393           if (dump_enabled_p ())
1394             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1395                              "not vectorized: control flow in loop.\n");
1396           return false;
1397         }
1398
1399       entryedge = loop_preheader_edge (innerloop);
1400       if (entryedge->src != loop->header
1401           || !single_exit (innerloop)
1402           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1403         {
1404           if (dump_enabled_p ())
1405             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1406                              "not vectorized: unsupported outerloop form.\n");
1407           return false;
1408         }
1409
1410       /* Analyze the inner-loop.  */
1411       tree inner_niterm1, inner_niter, inner_assumptions;
1412       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1413                                       &inner_assumptions, &inner_niterm1,
1414                                       &inner_niter, NULL)
1415           /* Don't support analyzing niter under assumptions for inner
1416              loop.  */
1417           || !integer_onep (inner_assumptions))
1418         {
1419           if (dump_enabled_p ())
1420             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1421                              "not vectorized: Bad inner loop.\n");
1422           return false;
1423         }
1424
1425       if (!expr_invariant_in_loop_p (loop, inner_niter))
1426         {
1427           if (dump_enabled_p ())
1428             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1429                              "not vectorized: inner-loop count not"
1430                              " invariant.\n");
1431           return false;
1432         }
1433
1434       if (dump_enabled_p ())
1435         dump_printf_loc (MSG_NOTE, vect_location,
1436                          "Considering outer-loop vectorization.\n");
1437     }
1438
1439   if (!single_exit (loop)
1440       || EDGE_COUNT (loop->header->preds) != 2)
1441     {
1442       if (dump_enabled_p ())
1443         {
1444           if (!single_exit (loop))
1445             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1446                              "not vectorized: multiple exits.\n");
1447           else if (EDGE_COUNT (loop->header->preds) != 2)
1448             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1449                              "not vectorized: too many incoming edges.\n");
1450         }
1451       return false;
1452     }
1453
1454   /* We assume that the loop exit condition is at the end of the loop. i.e,
1455      that the loop is represented as a do-while (with a proper if-guard
1456      before the loop if needed), where the loop header contains all the
1457      executable statements, and the latch is empty.  */
1458   if (!empty_block_p (loop->latch)
1459       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1460     {
1461       if (dump_enabled_p ())
1462         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1463                          "not vectorized: latch block not empty.\n");
1464       return false;
1465     }
1466
1467   /* Make sure the exit is not abnormal.  */
1468   edge e = single_exit (loop);
1469   if (e->flags & EDGE_ABNORMAL)
1470     {
1471       if (dump_enabled_p ())
1472         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1473                          "not vectorized: abnormal loop exit edge.\n");
1474       return false;
1475     }
1476
1477   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1478                                      number_of_iterationsm1);
1479   if (!*loop_cond)
1480     {
1481       if (dump_enabled_p ())
1482         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1483                          "not vectorized: complicated exit condition.\n");
1484       return false;
1485     }
1486
1487   if (integer_zerop (*assumptions)
1488       || !*number_of_iterations
1489       || chrec_contains_undetermined (*number_of_iterations))
1490     {
1491       if (dump_enabled_p ())
1492         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1493                          "not vectorized: number of iterations cannot be "
1494                          "computed.\n");
1495       return false;
1496     }
1497
1498   if (integer_zerop (*number_of_iterations))
1499     {
1500       if (dump_enabled_p ())
1501         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1502                          "not vectorized: number of iterations = 0.\n");
1503       return false;
1504     }
1505
1506   return true;
1507 }
1508
1509 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1510
1511 loop_vec_info
1512 vect_analyze_loop_form (struct loop *loop)
1513 {
1514   tree assumptions, number_of_iterations, number_of_iterationsm1;
1515   gcond *loop_cond, *inner_loop_cond = NULL;
1516
1517   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1518                                   &assumptions, &number_of_iterationsm1,
1519                                   &number_of_iterations, &inner_loop_cond))
1520     return NULL;
1521
1522   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1523   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1524   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1525   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1526   if (!integer_onep (assumptions))
1527     {
1528       /* We consider to vectorize this loop by versioning it under
1529          some assumptions.  In order to do this, we need to clear
1530          existing information computed by scev and niter analyzer.  */
1531       scev_reset_htab ();
1532       free_numbers_of_iterations_estimates (loop);
1533       /* Also set flag for this loop so that following scev and niter
1534          analysis are done under the assumptions.  */
1535       loop_constraint_set (loop, LOOP_C_FINITE);
1536       /* Also record the assumptions for versioning.  */
1537       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1538     }
1539
1540   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1541     {
1542       if (dump_enabled_p ())
1543         {
1544           dump_printf_loc (MSG_NOTE, vect_location,
1545                            "Symbolic number of iterations is ");
1546           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1547           dump_printf (MSG_NOTE, "\n");
1548         }
1549     }
1550
1551   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1552   if (inner_loop_cond)
1553     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1554       = loop_exit_ctrl_vec_info_type;
1555
1556   gcc_assert (!loop->aux);
1557   loop->aux = loop_vinfo;
1558   return loop_vinfo;
1559 }
1560
1561
1562
1563 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1564    statements update the vectorization factor.  */
1565
1566 static void
1567 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1568 {
1569   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1570   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1571   int nbbs = loop->num_nodes;
1572   poly_uint64 vectorization_factor;
1573   int i;
1574
1575   if (dump_enabled_p ())
1576     dump_printf_loc (MSG_NOTE, vect_location,
1577                      "=== vect_update_vf_for_slp ===\n");
1578
1579   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1580   gcc_assert (known_ne (vectorization_factor, 0U));
1581
1582   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1583      vectorization factor of the loop is the unrolling factor required by
1584      the SLP instances.  If that unrolling factor is 1, we say, that we
1585      perform pure SLP on loop - cross iteration parallelism is not
1586      exploited.  */
1587   bool only_slp_in_loop = true;
1588   for (i = 0; i < nbbs; i++)
1589     {
1590       basic_block bb = bbs[i];
1591       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1592            gsi_next (&si))
1593         {
1594           gimple *stmt = gsi_stmt (si);
1595           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1596           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1597               && STMT_VINFO_RELATED_STMT (stmt_info))
1598             {
1599               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1600               stmt_info = vinfo_for_stmt (stmt);
1601             }
1602           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1603                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1604               && !PURE_SLP_STMT (stmt_info))
1605             /* STMT needs both SLP and loop-based vectorization.  */
1606             only_slp_in_loop = false;
1607         }
1608     }
1609
1610   if (only_slp_in_loop)
1611     {
1612       dump_printf_loc (MSG_NOTE, vect_location,
1613                        "Loop contains only SLP stmts\n");
1614       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1615     }
1616   else
1617     {
1618       dump_printf_loc (MSG_NOTE, vect_location,
1619                        "Loop contains SLP and non-SLP stmts\n");
1620       /* Both the vectorization factor and unroll factor have the form
1621          current_vector_size * X for some rational X, so they must have
1622          a common multiple.  */
1623       vectorization_factor
1624         = force_common_multiple (vectorization_factor,
1625                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1626     }
1627
1628   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1629   if (dump_enabled_p ())
1630     {
1631       dump_printf_loc (MSG_NOTE, vect_location,
1632                        "Updating vectorization factor to ");
1633       dump_dec (MSG_NOTE, vectorization_factor);
1634       dump_printf (MSG_NOTE, ".\n");
1635     }
1636 }
1637
1638 /* Function vect_analyze_loop_operations.
1639
1640    Scan the loop stmts and make sure they are all vectorizable.  */
1641
1642 static bool
1643 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1644 {
1645   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1646   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1647   int nbbs = loop->num_nodes;
1648   int i;
1649   stmt_vec_info stmt_info;
1650   bool need_to_vectorize = false;
1651   bool ok;
1652
1653   if (dump_enabled_p ())
1654     dump_printf_loc (MSG_NOTE, vect_location,
1655                      "=== vect_analyze_loop_operations ===\n");
1656
1657   for (i = 0; i < nbbs; i++)
1658     {
1659       basic_block bb = bbs[i];
1660
1661       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1662            gsi_next (&si))
1663         {
1664           gphi *phi = si.phi ();
1665           ok = true;
1666
1667           stmt_info = vinfo_for_stmt (phi);
1668           if (dump_enabled_p ())
1669             {
1670               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1671               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1672             }
1673           if (virtual_operand_p (gimple_phi_result (phi)))
1674             continue;
1675
1676           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1677              (i.e., a phi in the tail of the outer-loop).  */
1678           if (! is_loop_header_bb_p (bb))
1679             {
1680               /* FORNOW: we currently don't support the case that these phis
1681                  are not used in the outerloop (unless it is double reduction,
1682                  i.e., this phi is vect_reduction_def), cause this case
1683                  requires to actually do something here.  */
1684               if (STMT_VINFO_LIVE_P (stmt_info)
1685                   && STMT_VINFO_DEF_TYPE (stmt_info)
1686                      != vect_double_reduction_def)
1687                 {
1688                   if (dump_enabled_p ())
1689                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1690                                      "Unsupported loop-closed phi in "
1691                                      "outer-loop.\n");
1692                   return false;
1693                 }
1694
1695               /* If PHI is used in the outer loop, we check that its operand
1696                  is defined in the inner loop.  */
1697               if (STMT_VINFO_RELEVANT_P (stmt_info))
1698                 {
1699                   tree phi_op;
1700                   gimple *op_def_stmt;
1701
1702                   if (gimple_phi_num_args (phi) != 1)
1703                     return false;
1704
1705                   phi_op = PHI_ARG_DEF (phi, 0);
1706                   if (TREE_CODE (phi_op) != SSA_NAME)
1707                     return false;
1708
1709                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1710                   if (gimple_nop_p (op_def_stmt)
1711                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1712                       || !vinfo_for_stmt (op_def_stmt))
1713                     return false;
1714
1715                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1716                         != vect_used_in_outer
1717                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1718                            != vect_used_in_outer_by_reduction)
1719                     return false;
1720                 }
1721
1722               continue;
1723             }
1724
1725           gcc_assert (stmt_info);
1726
1727           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1728                || STMT_VINFO_LIVE_P (stmt_info))
1729               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1730             {
1731               /* A scalar-dependence cycle that we don't support.  */
1732               if (dump_enabled_p ())
1733                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1734                                  "not vectorized: scalar dependence cycle.\n");
1735               return false;
1736             }
1737
1738           if (STMT_VINFO_RELEVANT_P (stmt_info))
1739             {
1740               need_to_vectorize = true;
1741               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1742                   && ! PURE_SLP_STMT (stmt_info))
1743                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1744               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1745                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1746                        && ! PURE_SLP_STMT (stmt_info))
1747                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1748             }
1749
1750           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1751             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1752
1753           if (!ok)
1754             {
1755               if (dump_enabled_p ())
1756                 {
1757                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1758                                    "not vectorized: relevant phi not "
1759                                    "supported: ");
1760                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1761                 }
1762               return false;
1763             }
1764         }
1765
1766       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1767            gsi_next (&si))
1768         {
1769           gimple *stmt = gsi_stmt (si);
1770           if (!gimple_clobber_p (stmt)
1771               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1772             return false;
1773         }
1774     } /* bbs */
1775
1776   /* All operations in the loop are either irrelevant (deal with loop
1777      control, or dead), or only used outside the loop and can be moved
1778      out of the loop (e.g. invariants, inductions).  The loop can be
1779      optimized away by scalar optimizations.  We're better off not
1780      touching this loop.  */
1781   if (!need_to_vectorize)
1782     {
1783       if (dump_enabled_p ())
1784         dump_printf_loc (MSG_NOTE, vect_location,
1785                          "All the computation can be taken out of the loop.\n");
1786       if (dump_enabled_p ())
1787         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1788                          "not vectorized: redundant loop. no profit to "
1789                          "vectorize.\n");
1790       return false;
1791     }
1792
1793   return true;
1794 }
1795
1796
1797 /* Function vect_analyze_loop_2.
1798
1799    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1800    for it.  The different analyses will record information in the
1801    loop_vec_info struct.  */
1802 static bool
1803 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1804 {
1805   bool ok;
1806   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1807   poly_uint64 min_vf = 2;
1808   unsigned int n_stmts = 0;
1809
1810   /* The first group of checks is independent of the vector size.  */
1811   fatal = true;
1812
1813   /* Find all data references in the loop (which correspond to vdefs/vuses)
1814      and analyze their evolution in the loop.  */
1815
1816   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1817
1818   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1819   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1820     {
1821       if (dump_enabled_p ())
1822         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1823                          "not vectorized: loop nest containing two "
1824                          "or more consecutive inner loops cannot be "
1825                          "vectorized\n");
1826       return false;
1827     }
1828
1829   for (unsigned i = 0; i < loop->num_nodes; i++)
1830     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1831          !gsi_end_p (gsi); gsi_next (&gsi))
1832       {
1833         gimple *stmt = gsi_stmt (gsi);
1834         if (is_gimple_debug (stmt))
1835           continue;
1836         ++n_stmts;
1837         if (!find_data_references_in_stmt (loop, stmt,
1838                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1839           {
1840             if (is_gimple_call (stmt) && loop->safelen)
1841               {
1842                 tree fndecl = gimple_call_fndecl (stmt), op;
1843                 if (fndecl != NULL_TREE)
1844                   {
1845                     cgraph_node *node = cgraph_node::get (fndecl);
1846                     if (node != NULL && node->simd_clones != NULL)
1847                       {
1848                         unsigned int j, n = gimple_call_num_args (stmt);
1849                         for (j = 0; j < n; j++)
1850                           {
1851                             op = gimple_call_arg (stmt, j);
1852                             if (DECL_P (op)
1853                                 || (REFERENCE_CLASS_P (op)
1854                                     && get_base_address (op)))
1855                               break;
1856                           }
1857                         op = gimple_call_lhs (stmt);
1858                         /* Ignore #pragma omp declare simd functions
1859                            if they don't have data references in the
1860                            call stmt itself.  */
1861                         if (j == n
1862                             && !(op
1863                                  && (DECL_P (op)
1864                                      || (REFERENCE_CLASS_P (op)
1865                                          && get_base_address (op)))))
1866                           continue;
1867                       }
1868                   }
1869               }
1870             if (dump_enabled_p ())
1871               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872                                "not vectorized: loop contains function "
1873                                "calls or data references that cannot "
1874                                "be analyzed\n");
1875             return false;
1876           }
1877       }
1878
1879   /* Analyze the data references and also adjust the minimal
1880      vectorization factor according to the loads and stores.  */
1881
1882   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1883   if (!ok)
1884     {
1885       if (dump_enabled_p ())
1886         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1887                          "bad data references.\n");
1888       return false;
1889     }
1890
1891   /* Classify all cross-iteration scalar data-flow cycles.
1892      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1893   vect_analyze_scalar_cycles (loop_vinfo);
1894
1895   vect_pattern_recog (loop_vinfo);
1896
1897   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1898
1899   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1900      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1901
1902   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1903   if (!ok)
1904     {
1905       if (dump_enabled_p ())
1906         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1907                          "bad data access.\n");
1908       return false;
1909     }
1910
1911   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1912
1913   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1914   if (!ok)
1915     {
1916       if (dump_enabled_p ())
1917         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1918                          "unexpected pattern.\n");
1919       return false;
1920     }
1921
1922   /* While the rest of the analysis below depends on it in some way.  */
1923   fatal = false;
1924
1925   /* Analyze data dependences between the data-refs in the loop
1926      and adjust the maximum vectorization factor according to
1927      the dependences.
1928      FORNOW: fail at the first data dependence that we encounter.  */
1929
1930   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1931   if (!ok
1932       || (max_vf != MAX_VECTORIZATION_FACTOR
1933           && maybe_lt (max_vf, min_vf)))
1934     {
1935       if (dump_enabled_p ())
1936             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937                              "bad data dependence.\n");
1938       return false;
1939     }
1940   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1941
1942   ok = vect_determine_vectorization_factor (loop_vinfo);
1943   if (!ok)
1944     {
1945       if (dump_enabled_p ())
1946         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1947                          "can't determine vectorization factor.\n");
1948       return false;
1949     }
1950   if (max_vf != MAX_VECTORIZATION_FACTOR
1951       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1952     {
1953       if (dump_enabled_p ())
1954         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1955                          "bad data dependence.\n");
1956       return false;
1957     }
1958
1959   /* Compute the scalar iteration cost.  */
1960   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1961
1962   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1963   HOST_WIDE_INT estimated_niter;
1964   unsigned th;
1965   int min_scalar_loop_bound;
1966
1967   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1968   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1969   if (!ok)
1970     return false;
1971
1972   /* If there are any SLP instances mark them as pure_slp.  */
1973   bool slp = vect_make_slp_decision (loop_vinfo);
1974   if (slp)
1975     {
1976       /* Find stmts that need to be both vectorized and SLPed.  */
1977       vect_detect_hybrid_slp (loop_vinfo);
1978
1979       /* Update the vectorization factor based on the SLP decision.  */
1980       vect_update_vf_for_slp (loop_vinfo);
1981     }
1982
1983   /* This is the point where we can re-start analysis with SLP forced off.  */
1984 start_over:
1985
1986   /* Now the vectorization factor is final.  */
1987   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1988   gcc_assert (known_ne (vectorization_factor, 0U));
1989   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1990
1991   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1992     {
1993       dump_printf_loc (MSG_NOTE, vect_location,
1994                        "vectorization_factor = ");
1995       dump_dec (MSG_NOTE, vectorization_factor);
1996       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1997                    LOOP_VINFO_INT_NITERS (loop_vinfo));
1998     }
1999
2000   HOST_WIDE_INT max_niter
2001     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2002   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2003        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < assumed_vf))
2004       || (max_niter != -1
2005           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf))
2006     {
2007       if (dump_enabled_p ())
2008         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009                          "not vectorized: iteration count smaller than "
2010                          "vectorization factor.\n");
2011       return false;
2012     }
2013
2014   /* Analyze the alignment of the data-refs in the loop.
2015      Fail if a data reference is found that cannot be vectorized.  */
2016
2017   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2018   if (!ok)
2019     {
2020       if (dump_enabled_p ())
2021         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2022                          "bad data alignment.\n");
2023       return false;
2024     }
2025
2026   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2027      It is important to call pruning after vect_analyze_data_ref_accesses,
2028      since we use grouping information gathered by interleaving analysis.  */
2029   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2030   if (!ok)
2031     return false;
2032
2033   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2034      vectorization.  */
2035   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2036     {
2037     /* This pass will decide on using loop versioning and/or loop peeling in
2038        order to enhance the alignment of data references in the loop.  */
2039     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2040     if (!ok)
2041       {
2042         if (dump_enabled_p ())
2043           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2044                            "bad data alignment.\n");
2045         return false;
2046       }
2047     }
2048
2049   if (slp)
2050     {
2051       /* Analyze operations in the SLP instances.  Note this may
2052          remove unsupported SLP instances which makes the above
2053          SLP kind detection invalid.  */
2054       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2055       vect_slp_analyze_operations (loop_vinfo);
2056       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2057         goto again;
2058     }
2059
2060   /* Scan all the remaining operations in the loop that are not subject
2061      to SLP and make sure they are vectorizable.  */
2062   ok = vect_analyze_loop_operations (loop_vinfo);
2063   if (!ok)
2064     {
2065       if (dump_enabled_p ())
2066         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2067                          "bad operation or unsupported loop bound.\n");
2068       return false;
2069     }
2070
2071   /* If epilog loop is required because of data accesses with gaps,
2072      one additional iteration needs to be peeled.  Check if there is
2073      enough iterations for vectorization.  */
2074   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2075       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2076     {
2077       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2078       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2079
2080       if (known_lt (wi::to_widest (scalar_niters), vf))
2081         {
2082           if (dump_enabled_p ())
2083             dump_printf_loc (MSG_NOTE, vect_location,
2084                              "loop has no enough iterations to support"
2085                              " peeling for gaps.\n");
2086           return false;
2087         }
2088     }
2089
2090   /* Analyze cost.  Decide if worth while to vectorize.  */
2091   int min_profitable_estimate, min_profitable_iters;
2092   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2093                                       &min_profitable_estimate);
2094
2095   if (min_profitable_iters < 0)
2096     {
2097       if (dump_enabled_p ())
2098         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2099                          "not vectorized: vectorization not profitable.\n");
2100       if (dump_enabled_p ())
2101         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2102                          "not vectorized: vector version will never be "
2103                          "profitable.\n");
2104       goto again;
2105     }
2106
2107   min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2108                            * assumed_vf);
2109
2110   /* Use the cost model only if it is more conservative than user specified
2111      threshold.  */
2112   th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2113
2114   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2115
2116   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2117       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2118     {
2119       if (dump_enabled_p ())
2120         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2121                          "not vectorized: vectorization not profitable.\n");
2122       if (dump_enabled_p ())
2123         dump_printf_loc (MSG_NOTE, vect_location,
2124                          "not vectorized: iteration count smaller than user "
2125                          "specified loop bound parameter or minimum profitable "
2126                          "iterations (whichever is more conservative).\n");
2127       goto again;
2128     }
2129
2130   estimated_niter
2131     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2132   if (estimated_niter == -1)
2133     estimated_niter = max_niter;
2134   if (estimated_niter != -1
2135       && ((unsigned HOST_WIDE_INT) estimated_niter
2136           < MAX (th, (unsigned) min_profitable_estimate)))
2137     {
2138       if (dump_enabled_p ())
2139         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2140                          "not vectorized: estimated iteration count too "
2141                          "small.\n");
2142       if (dump_enabled_p ())
2143         dump_printf_loc (MSG_NOTE, vect_location,
2144                          "not vectorized: estimated iteration count smaller "
2145                          "than specified loop bound parameter or minimum "
2146                          "profitable iterations (whichever is more "
2147                          "conservative).\n");
2148       goto again;
2149     }
2150
2151   /* Decide whether we need to create an epilogue loop to handle
2152      remaining scalar iterations.  */
2153   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2154
2155   unsigned HOST_WIDE_INT const_vf;
2156   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2157       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2158     {
2159       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2160                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2161                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2162         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2163     }
2164   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2165            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2166            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2167                 < (unsigned) exact_log2 (const_vf))
2168                /* In case of versioning, check if the maximum number of
2169                   iterations is greater than th.  If they are identical,
2170                   the epilogue is unnecessary.  */
2171                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2172                    || ((unsigned HOST_WIDE_INT) max_niter
2173                        > (th / const_vf) * const_vf))))
2174     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2175
2176   /* If an epilogue loop is required make sure we can create one.  */
2177   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2178       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2179     {
2180       if (dump_enabled_p ())
2181         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2182       if (!vect_can_advance_ivs_p (loop_vinfo)
2183           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2184                                            single_exit (LOOP_VINFO_LOOP
2185                                                          (loop_vinfo))))
2186         {
2187           if (dump_enabled_p ())
2188             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2189                              "not vectorized: can't create required "
2190                              "epilog loop\n");
2191           goto again;
2192         }
2193     }
2194
2195   /* During peeling, we need to check if number of loop iterations is
2196      enough for both peeled prolog loop and vector loop.  This check
2197      can be merged along with threshold check of loop versioning, so
2198      increase threshold for this case if necessary.  */
2199   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2200     {
2201       poly_uint64 niters_th;
2202
2203       /* Niters for peeled prolog loop.  */
2204       if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2205         {
2206           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2207           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2208
2209           niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2210         }
2211       else
2212         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2213
2214       /* Niters for at least one iteration of vectorized loop.  */
2215       niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2216       /* One additional iteration because of peeling for gap.  */
2217       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2218         niters_th += 1;
2219       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2220     }
2221
2222   gcc_assert (known_eq (vectorization_factor,
2223                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2224
2225   /* Ok to vectorize!  */
2226   return true;
2227
2228 again:
2229   /* Try again with SLP forced off but if we didn't do any SLP there is
2230      no point in re-trying.  */
2231   if (!slp)
2232     return false;
2233
2234   /* If there are reduction chains re-trying will fail anyway.  */
2235   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2236     return false;
2237
2238   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2239      via interleaving or lane instructions.  */
2240   slp_instance instance;
2241   slp_tree node;
2242   unsigned i, j;
2243   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2244     {
2245       stmt_vec_info vinfo;
2246       vinfo = vinfo_for_stmt
2247           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2248       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2249         continue;
2250       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2251       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2252       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2253       if (! vect_store_lanes_supported (vectype, size)
2254           && ! vect_grouped_store_supported (vectype, size))
2255         return false;
2256       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2257         {
2258           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2259           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2260           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2261           size = STMT_VINFO_GROUP_SIZE (vinfo);
2262           vectype = STMT_VINFO_VECTYPE (vinfo);
2263           if (! vect_load_lanes_supported (vectype, size)
2264               && ! vect_grouped_load_supported (vectype, single_element_p,
2265                                                 size))
2266             return false;
2267         }
2268     }
2269
2270   if (dump_enabled_p ())
2271     dump_printf_loc (MSG_NOTE, vect_location,
2272                      "re-trying with SLP disabled\n");
2273
2274   /* Roll back state appropriately.  No SLP this time.  */
2275   slp = false;
2276   /* Restore vectorization factor as it were without SLP.  */
2277   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2278   /* Free the SLP instances.  */
2279   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2280     vect_free_slp_instance (instance);
2281   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2282   /* Reset SLP type to loop_vect on all stmts.  */
2283   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2284     {
2285       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2286       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2287            !gsi_end_p (si); gsi_next (&si))
2288         {
2289           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2290           STMT_SLP_TYPE (stmt_info) = loop_vect;
2291         }
2292       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2293            !gsi_end_p (si); gsi_next (&si))
2294         {
2295           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2296           STMT_SLP_TYPE (stmt_info) = loop_vect;
2297           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2298             {
2299               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2300               STMT_SLP_TYPE (stmt_info) = loop_vect;
2301               for (gimple_stmt_iterator pi
2302                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2303                    !gsi_end_p (pi); gsi_next (&pi))
2304                 {
2305                   gimple *pstmt = gsi_stmt (pi);
2306                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2307                 }
2308             }
2309         }
2310     }
2311   /* Free optimized alias test DDRS.  */
2312   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2313   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2314   /* Reset target cost data.  */
2315   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2316   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2317     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2318   /* Reset assorted flags.  */
2319   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2320   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2321   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2322   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2323
2324   goto start_over;
2325 }
2326
2327 /* Function vect_analyze_loop.
2328
2329    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2330    for it.  The different analyses will record information in the
2331    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2332    be vectorized.  */
2333 loop_vec_info
2334 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2335 {
2336   loop_vec_info loop_vinfo;
2337   auto_vector_sizes vector_sizes;
2338
2339   /* Autodetect first vector size we try.  */
2340   current_vector_size = 0;
2341   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2342   unsigned int next_size = 0;
2343
2344   if (dump_enabled_p ())
2345     dump_printf_loc (MSG_NOTE, vect_location,
2346                      "===== analyze_loop_nest =====\n");
2347
2348   if (loop_outer (loop)
2349       && loop_vec_info_for_loop (loop_outer (loop))
2350       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2351     {
2352       if (dump_enabled_p ())
2353         dump_printf_loc (MSG_NOTE, vect_location,
2354                          "outer-loop already vectorized.\n");
2355       return NULL;
2356     }
2357
2358   poly_uint64 autodetected_vector_size = 0;
2359   while (1)
2360     {
2361       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2362       loop_vinfo = vect_analyze_loop_form (loop);
2363       if (!loop_vinfo)
2364         {
2365           if (dump_enabled_p ())
2366             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2367                              "bad loop form.\n");
2368           return NULL;
2369         }
2370
2371       bool fatal = false;
2372
2373       if (orig_loop_vinfo)
2374         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2375
2376       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2377         {
2378           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2379
2380           return loop_vinfo;
2381         }
2382
2383       delete loop_vinfo;
2384
2385       if (next_size == 0)
2386         autodetected_vector_size = current_vector_size;
2387
2388       if (next_size < vector_sizes.length ()
2389           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2390         next_size += 1;
2391
2392       if (fatal
2393           || next_size == vector_sizes.length ()
2394           || known_eq (current_vector_size, 0U))
2395         return NULL;
2396
2397       /* Try the next biggest vector size.  */
2398       current_vector_size = vector_sizes[next_size++];
2399       if (dump_enabled_p ())
2400         {
2401           dump_printf_loc (MSG_NOTE, vect_location,
2402                            "***** Re-trying analysis with "
2403                            "vector size ");
2404           dump_dec (MSG_NOTE, current_vector_size);
2405           dump_printf (MSG_NOTE, "\n");
2406         }
2407     }
2408 }
2409
2410
2411 /* Function reduction_fn_for_scalar_code
2412
2413    Input:
2414    CODE - tree_code of a reduction operations.
2415
2416    Output:
2417    REDUC_FN - the corresponding internal function to be used to reduce the
2418       vector of partial results into a single scalar result, or IFN_LAST
2419       if the operation is a supported reduction operation, but does not have
2420       such an internal function.
2421
2422    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2423
2424 static bool
2425 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2426 {
2427   switch (code)
2428     {
2429       case MAX_EXPR:
2430         *reduc_fn = IFN_REDUC_MAX;
2431         return true;
2432
2433       case MIN_EXPR:
2434         *reduc_fn = IFN_REDUC_MIN;
2435         return true;
2436
2437       case PLUS_EXPR:
2438         *reduc_fn = IFN_REDUC_PLUS;
2439         return true;
2440
2441       case MULT_EXPR:
2442       case MINUS_EXPR:
2443       case BIT_IOR_EXPR:
2444       case BIT_XOR_EXPR:
2445       case BIT_AND_EXPR:
2446         *reduc_fn = IFN_LAST;
2447         return true;
2448
2449       default:
2450        return false;
2451     }
2452 }
2453
2454
2455 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2456    STMT is printed with a message MSG. */
2457
2458 static void
2459 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2460 {
2461   dump_printf_loc (msg_type, vect_location, "%s", msg);
2462   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2463 }
2464
2465
2466 /* Detect SLP reduction of the form:
2467
2468    #a1 = phi <a5, a0>
2469    a2 = operation (a1)
2470    a3 = operation (a2)
2471    a4 = operation (a3)
2472    a5 = operation (a4)
2473
2474    #a = phi <a5>
2475
2476    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2477    FIRST_STMT is the first reduction stmt in the chain
2478    (a2 = operation (a1)).
2479
2480    Return TRUE if a reduction chain was detected.  */
2481
2482 static bool
2483 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2484                        gimple *first_stmt)
2485 {
2486   struct loop *loop = (gimple_bb (phi))->loop_father;
2487   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2488   enum tree_code code;
2489   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2490   stmt_vec_info use_stmt_info, current_stmt_info;
2491   tree lhs;
2492   imm_use_iterator imm_iter;
2493   use_operand_p use_p;
2494   int nloop_uses, size = 0, n_out_of_loop_uses;
2495   bool found = false;
2496
2497   if (loop != vect_loop)
2498     return false;
2499
2500   lhs = PHI_RESULT (phi);
2501   code = gimple_assign_rhs_code (first_stmt);
2502   while (1)
2503     {
2504       nloop_uses = 0;
2505       n_out_of_loop_uses = 0;
2506       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2507         {
2508           gimple *use_stmt = USE_STMT (use_p);
2509           if (is_gimple_debug (use_stmt))
2510             continue;
2511
2512           /* Check if we got back to the reduction phi.  */
2513           if (use_stmt == phi)
2514             {
2515               loop_use_stmt = use_stmt;
2516               found = true;
2517               break;
2518             }
2519
2520           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2521             {
2522               loop_use_stmt = use_stmt;
2523               nloop_uses++;
2524             }
2525            else
2526              n_out_of_loop_uses++;
2527
2528            /* There are can be either a single use in the loop or two uses in
2529               phi nodes.  */
2530            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2531              return false;
2532         }
2533
2534       if (found)
2535         break;
2536
2537       /* We reached a statement with no loop uses.  */
2538       if (nloop_uses == 0)
2539         return false;
2540
2541       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2542       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2543         return false;
2544
2545       if (!is_gimple_assign (loop_use_stmt)
2546           || code != gimple_assign_rhs_code (loop_use_stmt)
2547           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2548         return false;
2549
2550       /* Insert USE_STMT into reduction chain.  */
2551       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2552       if (current_stmt)
2553         {
2554           current_stmt_info = vinfo_for_stmt (current_stmt);
2555           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2556           GROUP_FIRST_ELEMENT (use_stmt_info)
2557             = GROUP_FIRST_ELEMENT (current_stmt_info);
2558         }
2559       else
2560         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2561
2562       lhs = gimple_assign_lhs (loop_use_stmt);
2563       current_stmt = loop_use_stmt;
2564       size++;
2565    }
2566
2567   if (!found || loop_use_stmt != phi || size < 2)
2568     return false;
2569
2570   /* Swap the operands, if needed, to make the reduction operand be the second
2571      operand.  */
2572   lhs = PHI_RESULT (phi);
2573   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2574   while (next_stmt)
2575     {
2576       if (gimple_assign_rhs2 (next_stmt) == lhs)
2577         {
2578           tree op = gimple_assign_rhs1 (next_stmt);
2579           gimple *def_stmt = NULL;
2580
2581           if (TREE_CODE (op) == SSA_NAME)
2582             def_stmt = SSA_NAME_DEF_STMT (op);
2583
2584           /* Check that the other def is either defined in the loop
2585              ("vect_internal_def"), or it's an induction (defined by a
2586              loop-header phi-node).  */
2587           if (def_stmt
2588               && gimple_bb (def_stmt)
2589               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2590               && (is_gimple_assign (def_stmt)
2591                   || is_gimple_call (def_stmt)
2592                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2593                            == vect_induction_def
2594                   || (gimple_code (def_stmt) == GIMPLE_PHI
2595                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2596                                   == vect_internal_def
2597                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2598             {
2599               lhs = gimple_assign_lhs (next_stmt);
2600               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2601               continue;
2602             }
2603
2604           return false;
2605         }
2606       else
2607         {
2608           tree op = gimple_assign_rhs2 (next_stmt);
2609           gimple *def_stmt = NULL;
2610
2611           if (TREE_CODE (op) == SSA_NAME)
2612             def_stmt = SSA_NAME_DEF_STMT (op);
2613
2614           /* Check that the other def is either defined in the loop
2615             ("vect_internal_def"), or it's an induction (defined by a
2616             loop-header phi-node).  */
2617           if (def_stmt
2618               && gimple_bb (def_stmt)
2619               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2620               && (is_gimple_assign (def_stmt)
2621                   || is_gimple_call (def_stmt)
2622                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2623                               == vect_induction_def
2624                   || (gimple_code (def_stmt) == GIMPLE_PHI
2625                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2626                                   == vect_internal_def
2627                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2628             {
2629               if (dump_enabled_p ())
2630                 {
2631                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2632                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2633                 }
2634
2635               swap_ssa_operands (next_stmt,
2636                                  gimple_assign_rhs1_ptr (next_stmt),
2637                                  gimple_assign_rhs2_ptr (next_stmt));
2638               update_stmt (next_stmt);
2639
2640               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2641                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2642             }
2643           else
2644             return false;
2645         }
2646
2647       lhs = gimple_assign_lhs (next_stmt);
2648       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2649     }
2650
2651   /* Save the chain for further analysis in SLP detection.  */
2652   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2653   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2654   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2655
2656   return true;
2657 }
2658
2659
2660 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2661    reduction operation CODE has a handled computation expression.  */
2662
2663 bool
2664 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2665                       enum tree_code code)
2666 {
2667   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2668   auto_bitmap visited;
2669   tree lookfor = PHI_RESULT (phi);
2670   ssa_op_iter curri;
2671   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2672   while (USE_FROM_PTR (curr) != loop_arg)
2673     curr = op_iter_next_use (&curri);
2674   curri.i = curri.numops;
2675   do
2676     {
2677       path.safe_push (std::make_pair (curri, curr));
2678       tree use = USE_FROM_PTR (curr);
2679       if (use == lookfor)
2680         break;
2681       gimple *def = SSA_NAME_DEF_STMT (use);
2682       if (gimple_nop_p (def)
2683           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2684         {
2685 pop:
2686           do
2687             {
2688               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2689               curri = x.first;
2690               curr = x.second;
2691               do
2692                 curr = op_iter_next_use (&curri);
2693               /* Skip already visited or non-SSA operands (from iterating
2694                  over PHI args).  */
2695               while (curr != NULL_USE_OPERAND_P
2696                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2697                          || ! bitmap_set_bit (visited,
2698                                               SSA_NAME_VERSION
2699                                                 (USE_FROM_PTR (curr)))));
2700             }
2701           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2702           if (curr == NULL_USE_OPERAND_P)
2703             break;
2704         }
2705       else
2706         {
2707           if (gimple_code (def) == GIMPLE_PHI)
2708             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2709           else
2710             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2711           while (curr != NULL_USE_OPERAND_P
2712                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2713                      || ! bitmap_set_bit (visited,
2714                                           SSA_NAME_VERSION
2715                                             (USE_FROM_PTR (curr)))))
2716             curr = op_iter_next_use (&curri);
2717           if (curr == NULL_USE_OPERAND_P)
2718             goto pop;
2719         }
2720     }
2721   while (1);
2722   if (dump_file && (dump_flags & TDF_DETAILS))
2723     {
2724       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2725       unsigned i;
2726       std::pair<ssa_op_iter, use_operand_p> *x;
2727       FOR_EACH_VEC_ELT (path, i, x)
2728         {
2729           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2730           dump_printf (MSG_NOTE, " ");
2731         }
2732       dump_printf (MSG_NOTE, "\n");
2733     }
2734
2735   /* Check whether the reduction path detected is valid.  */
2736   bool fail = path.length () == 0;
2737   bool neg = false;
2738   for (unsigned i = 1; i < path.length (); ++i)
2739     {
2740       gimple *use_stmt = USE_STMT (path[i].second);
2741       tree op = USE_FROM_PTR (path[i].second);
2742       if (! has_single_use (op)
2743           || ! is_gimple_assign (use_stmt))
2744         {
2745           fail = true;
2746           break;
2747         }
2748       if (gimple_assign_rhs_code (use_stmt) != code)
2749         {
2750           if (code == PLUS_EXPR
2751               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2752             {
2753               /* Track whether we negate the reduction value each iteration.  */
2754               if (gimple_assign_rhs2 (use_stmt) == op)
2755                 neg = ! neg;
2756             }
2757           else
2758             {
2759               fail = true;
2760               break;
2761             }
2762         }
2763     }
2764   return ! fail && ! neg;
2765 }
2766
2767
2768 /* Function vect_is_simple_reduction
2769
2770    (1) Detect a cross-iteration def-use cycle that represents a simple
2771    reduction computation.  We look for the following pattern:
2772
2773    loop_header:
2774      a1 = phi < a0, a2 >
2775      a3 = ...
2776      a2 = operation (a3, a1)
2777
2778    or
2779
2780    a3 = ...
2781    loop_header:
2782      a1 = phi < a0, a2 >
2783      a2 = operation (a3, a1)
2784
2785    such that:
2786    1. operation is commutative and associative and it is safe to
2787       change the order of the computation
2788    2. no uses for a2 in the loop (a2 is used out of the loop)
2789    3. no uses of a1 in the loop besides the reduction operation
2790    4. no uses of a1 outside the loop.
2791
2792    Conditions 1,4 are tested here.
2793    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2794
2795    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2796    nested cycles.
2797
2798    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2799    reductions:
2800
2801      a1 = phi < a0, a2 >
2802      inner loop (def of a3)
2803      a2 = phi < a3 >
2804
2805    (4) Detect condition expressions, ie:
2806      for (int i = 0; i < N; i++)
2807        if (a[i] < val)
2808         ret_val = a[i];
2809
2810 */
2811
2812 static gimple *
2813 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2814                           bool *double_reduc,
2815                           bool need_wrapping_integral_overflow,
2816                           enum vect_reduction_type *v_reduc_type)
2817 {
2818   struct loop *loop = (gimple_bb (phi))->loop_father;
2819   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2820   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2821   enum tree_code orig_code, code;
2822   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2823   tree type;
2824   int nloop_uses;
2825   tree name;
2826   imm_use_iterator imm_iter;
2827   use_operand_p use_p;
2828   bool phi_def;
2829
2830   *double_reduc = false;
2831   *v_reduc_type = TREE_CODE_REDUCTION;
2832
2833   tree phi_name = PHI_RESULT (phi);
2834   /* ???  If there are no uses of the PHI result the inner loop reduction
2835      won't be detected as possibly double-reduction by vectorizable_reduction
2836      because that tries to walk the PHI arg from the preheader edge which
2837      can be constant.  See PR60382.  */
2838   if (has_zero_uses (phi_name))
2839     return NULL;
2840   nloop_uses = 0;
2841   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2842     {
2843       gimple *use_stmt = USE_STMT (use_p);
2844       if (is_gimple_debug (use_stmt))
2845         continue;
2846
2847       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2848         {
2849           if (dump_enabled_p ())
2850             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2851                              "intermediate value used outside loop.\n");
2852
2853           return NULL;
2854         }
2855
2856       nloop_uses++;
2857       if (nloop_uses > 1)
2858         {
2859           if (dump_enabled_p ())
2860             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2861                              "reduction value used in loop.\n");
2862           return NULL;
2863         }
2864
2865       phi_use_stmt = use_stmt;
2866     }
2867
2868   edge latch_e = loop_latch_edge (loop);
2869   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2870   if (TREE_CODE (loop_arg) != SSA_NAME)
2871     {
2872       if (dump_enabled_p ())
2873         {
2874           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2875                            "reduction: not ssa_name: ");
2876           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2877           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2878         }
2879       return NULL;
2880     }
2881
2882   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2883   if (is_gimple_assign (def_stmt))
2884     {
2885       name = gimple_assign_lhs (def_stmt);
2886       phi_def = false;
2887     }
2888   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2889     {
2890       name = PHI_RESULT (def_stmt);
2891       phi_def = true;
2892     }
2893   else
2894     {
2895       if (dump_enabled_p ())
2896         {
2897           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2898                            "reduction: unhandled reduction operation: ");
2899           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2900         }
2901       return NULL;
2902     }
2903
2904   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2905     return NULL;
2906
2907   nloop_uses = 0;
2908   auto_vec<gphi *, 3> lcphis;
2909   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2910     {
2911       gimple *use_stmt = USE_STMT (use_p);
2912       if (is_gimple_debug (use_stmt))
2913         continue;
2914       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2915         nloop_uses++;
2916       else
2917         /* We can have more than one loop-closed PHI.  */
2918         lcphis.safe_push (as_a <gphi *> (use_stmt));
2919       if (nloop_uses > 1)
2920         {
2921           if (dump_enabled_p ())
2922             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2923                              "reduction used in loop.\n");
2924           return NULL;
2925         }
2926     }
2927
2928   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2929      defined in the inner loop.  */
2930   if (phi_def)
2931     {
2932       op1 = PHI_ARG_DEF (def_stmt, 0);
2933
2934       if (gimple_phi_num_args (def_stmt) != 1
2935           || TREE_CODE (op1) != SSA_NAME)
2936         {
2937           if (dump_enabled_p ())
2938             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2939                              "unsupported phi node definition.\n");
2940
2941           return NULL;
2942         }
2943
2944       def1 = SSA_NAME_DEF_STMT (op1);
2945       if (gimple_bb (def1)
2946           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2947           && loop->inner
2948           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2949           && is_gimple_assign (def1)
2950           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2951         {
2952           if (dump_enabled_p ())
2953             report_vect_op (MSG_NOTE, def_stmt,
2954                             "detected double reduction: ");
2955
2956           *double_reduc = true;
2957           return def_stmt;
2958         }
2959
2960       return NULL;
2961     }
2962
2963   /* If we are vectorizing an inner reduction we are executing that
2964      in the original order only in case we are not dealing with a
2965      double reduction.  */
2966   bool check_reduction = true;
2967   if (flow_loop_nested_p (vect_loop, loop))
2968     {
2969       gphi *lcphi;
2970       unsigned i;
2971       check_reduction = false;
2972       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2973         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2974           {
2975             gimple *use_stmt = USE_STMT (use_p);
2976             if (is_gimple_debug (use_stmt))
2977               continue;
2978             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2979               check_reduction = true;
2980           }
2981     }
2982
2983   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2984   code = orig_code = gimple_assign_rhs_code (def_stmt);
2985
2986   /* We can handle "res -= x[i]", which is non-associative by
2987      simply rewriting this into "res += -x[i]".  Avoid changing
2988      gimple instruction for the first simple tests and only do this
2989      if we're allowed to change code at all.  */
2990   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2991     code = PLUS_EXPR;
2992
2993   if (code == COND_EXPR)
2994     {
2995       if (! nested_in_vect_loop)
2996         *v_reduc_type = COND_REDUCTION;
2997
2998       op3 = gimple_assign_rhs1 (def_stmt);
2999       if (COMPARISON_CLASS_P (op3))
3000         {
3001           op4 = TREE_OPERAND (op3, 1);
3002           op3 = TREE_OPERAND (op3, 0);
3003         }
3004       if (op3 == phi_name || op4 == phi_name)
3005         {
3006           if (dump_enabled_p ())
3007             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3008                             "reduction: condition depends on previous"
3009                             " iteration: ");
3010           return NULL;
3011         }
3012
3013       op1 = gimple_assign_rhs2 (def_stmt);
3014       op2 = gimple_assign_rhs3 (def_stmt);
3015     }
3016   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3017     {
3018       if (dump_enabled_p ())
3019         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3020                         "reduction: not commutative/associative: ");
3021       return NULL;
3022     }
3023   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3024     {
3025       op1 = gimple_assign_rhs1 (def_stmt);
3026       op2 = gimple_assign_rhs2 (def_stmt);
3027     }
3028   else
3029     {
3030       if (dump_enabled_p ())
3031         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3032                         "reduction: not handled operation: ");
3033       return NULL;
3034     }
3035
3036   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3037     {
3038       if (dump_enabled_p ())
3039         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3040                         "reduction: both uses not ssa_names: ");
3041
3042       return NULL;
3043     }
3044
3045   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3046   if ((TREE_CODE (op1) == SSA_NAME
3047        && !types_compatible_p (type,TREE_TYPE (op1)))
3048       || (TREE_CODE (op2) == SSA_NAME
3049           && !types_compatible_p (type, TREE_TYPE (op2)))
3050       || (op3 && TREE_CODE (op3) == SSA_NAME
3051           && !types_compatible_p (type, TREE_TYPE (op3)))
3052       || (op4 && TREE_CODE (op4) == SSA_NAME
3053           && !types_compatible_p (type, TREE_TYPE (op4))))
3054     {
3055       if (dump_enabled_p ())
3056         {
3057           dump_printf_loc (MSG_NOTE, vect_location,
3058                            "reduction: multiple types: operation type: ");
3059           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3060           dump_printf (MSG_NOTE, ", operands types: ");
3061           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3062                              TREE_TYPE (op1));
3063           dump_printf (MSG_NOTE, ",");
3064           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3065                              TREE_TYPE (op2));
3066           if (op3)
3067             {
3068               dump_printf (MSG_NOTE, ",");
3069               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3070                                  TREE_TYPE (op3));
3071             }
3072
3073           if (op4)
3074             {
3075               dump_printf (MSG_NOTE, ",");
3076               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3077                                  TREE_TYPE (op4));
3078             }
3079           dump_printf (MSG_NOTE, "\n");
3080         }
3081
3082       return NULL;
3083     }
3084
3085   /* Check that it's ok to change the order of the computation.
3086      Generally, when vectorizing a reduction we change the order of the
3087      computation.  This may change the behavior of the program in some
3088      cases, so we need to check that this is ok.  One exception is when
3089      vectorizing an outer-loop: the inner-loop is executed sequentially,
3090      and therefore vectorizing reductions in the inner-loop during
3091      outer-loop vectorization is safe.  */
3092
3093   if (*v_reduc_type != COND_REDUCTION
3094       && check_reduction)
3095     {
3096       /* CHECKME: check for !flag_finite_math_only too?  */
3097       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
3098         {
3099           /* Changing the order of operations changes the semantics.  */
3100           if (dump_enabled_p ())
3101             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3102                         "reduction: unsafe fp math optimization: ");
3103           return NULL;
3104         }
3105       else if (INTEGRAL_TYPE_P (type))
3106         {
3107           if (!operation_no_trapping_overflow (type, code))
3108             {
3109               /* Changing the order of operations changes the semantics.  */
3110               if (dump_enabled_p ())
3111                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3112                                 "reduction: unsafe int math optimization"
3113                                 " (overflow traps): ");
3114               return NULL;
3115             }
3116           if (need_wrapping_integral_overflow
3117               && !TYPE_OVERFLOW_WRAPS (type)
3118               && operation_can_overflow (code))
3119             {
3120               /* Changing the order of operations changes the semantics.  */
3121               if (dump_enabled_p ())
3122                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3123                                 "reduction: unsafe int math optimization"
3124                                 " (overflow doesn't wrap): ");
3125               return NULL;
3126             }
3127         }
3128       else if (SAT_FIXED_POINT_TYPE_P (type))
3129         {
3130           /* Changing the order of operations changes the semantics.  */
3131           if (dump_enabled_p ())
3132           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3133                           "reduction: unsafe fixed-point math optimization: ");
3134           return NULL;
3135         }
3136     }
3137
3138   /* Reduction is safe. We're dealing with one of the following:
3139      1) integer arithmetic and no trapv
3140      2) floating point arithmetic, and special flags permit this optimization
3141      3) nested cycle (i.e., outer loop vectorization).  */
3142   if (TREE_CODE (op1) == SSA_NAME)
3143     def1 = SSA_NAME_DEF_STMT (op1);
3144
3145   if (TREE_CODE (op2) == SSA_NAME)
3146     def2 = SSA_NAME_DEF_STMT (op2);
3147
3148   if (code != COND_EXPR
3149       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3150     {
3151       if (dump_enabled_p ())
3152         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3153       return NULL;
3154     }
3155
3156   /* Check that one def is the reduction def, defined by PHI,
3157      the other def is either defined in the loop ("vect_internal_def"),
3158      or it's an induction (defined by a loop-header phi-node).  */
3159
3160   if (def2 && def2 == phi
3161       && (code == COND_EXPR
3162           || !def1 || gimple_nop_p (def1)
3163           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3164           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3165               && (is_gimple_assign (def1)
3166                   || is_gimple_call (def1)
3167                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3168                       == vect_induction_def
3169                   || (gimple_code (def1) == GIMPLE_PHI
3170                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3171                           == vect_internal_def
3172                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3173     {
3174       if (dump_enabled_p ())
3175         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3176       return def_stmt;
3177     }
3178
3179   if (def1 && def1 == phi
3180       && (code == COND_EXPR
3181           || !def2 || gimple_nop_p (def2)
3182           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3183           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3184               && (is_gimple_assign (def2)
3185                   || is_gimple_call (def2)
3186                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3187                        == vect_induction_def
3188                   || (gimple_code (def2) == GIMPLE_PHI
3189                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3190                            == vect_internal_def
3191                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3192     {
3193       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3194         {
3195           /* Check if we can swap operands (just for simplicity - so that
3196              the rest of the code can assume that the reduction variable
3197              is always the last (second) argument).  */
3198           if (code == COND_EXPR)
3199             {
3200               /* Swap cond_expr by inverting the condition.  */
3201               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3202               enum tree_code invert_code = ERROR_MARK;
3203               enum tree_code cond_code = TREE_CODE (cond_expr);
3204
3205               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3206                 {
3207                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3208                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3209                 }
3210               if (invert_code != ERROR_MARK)
3211                 {
3212                   TREE_SET_CODE (cond_expr, invert_code);
3213                   swap_ssa_operands (def_stmt,
3214                                      gimple_assign_rhs2_ptr (def_stmt),
3215                                      gimple_assign_rhs3_ptr (def_stmt));
3216                 }
3217               else
3218                 {
3219                   if (dump_enabled_p ())
3220                     report_vect_op (MSG_NOTE, def_stmt,
3221                                     "detected reduction: cannot swap operands "
3222                                     "for cond_expr");
3223                   return NULL;
3224                 }
3225             }
3226           else
3227             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3228                                gimple_assign_rhs2_ptr (def_stmt));
3229
3230           if (dump_enabled_p ())
3231             report_vect_op (MSG_NOTE, def_stmt,
3232                             "detected reduction: need to swap operands: ");
3233
3234           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3235             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3236         }
3237       else
3238         {
3239           if (dump_enabled_p ())
3240             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3241         }
3242
3243       return def_stmt;
3244     }
3245
3246   /* Try to find SLP reduction chain.  */
3247   if (! nested_in_vect_loop
3248       && code != COND_EXPR
3249       && orig_code != MINUS_EXPR
3250       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3251     {
3252       if (dump_enabled_p ())
3253         report_vect_op (MSG_NOTE, def_stmt,
3254                         "reduction: detected reduction chain: ");
3255
3256       return def_stmt;
3257     }
3258
3259   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3260   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3261   while (first)
3262     {
3263       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3264       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3265       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3266       first = next;
3267     }
3268
3269   /* Look for the expression computing loop_arg from loop PHI result.  */
3270   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3271                             code))
3272     return def_stmt;
3273
3274   if (dump_enabled_p ())
3275     {
3276       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3277                       "reduction: unknown pattern: ");
3278     }
3279
3280   return NULL;
3281 }
3282
3283 /* Wrapper around vect_is_simple_reduction, which will modify code
3284    in-place if it enables detection of more reductions.  Arguments
3285    as there.  */
3286
3287 gimple *
3288 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3289                              bool *double_reduc,
3290                              bool need_wrapping_integral_overflow)
3291 {
3292   enum vect_reduction_type v_reduc_type;
3293   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3294                                           need_wrapping_integral_overflow,
3295                                           &v_reduc_type);
3296   if (def)
3297     {
3298       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3299       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3300       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3301       reduc_def_info = vinfo_for_stmt (def);
3302       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3303     }
3304   return def;
3305 }
3306
3307 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3308 int
3309 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3310                              int *peel_iters_epilogue,
3311                              stmt_vector_for_cost *scalar_cost_vec,
3312                              stmt_vector_for_cost *prologue_cost_vec,
3313                              stmt_vector_for_cost *epilogue_cost_vec)
3314 {
3315   int retval = 0;
3316   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3317
3318   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3319     {
3320       *peel_iters_epilogue = assumed_vf / 2;
3321       if (dump_enabled_p ())
3322         dump_printf_loc (MSG_NOTE, vect_location,
3323                          "cost model: epilogue peel iters set to vf/2 "
3324                          "because loop iterations are unknown .\n");
3325
3326       /* If peeled iterations are known but number of scalar loop
3327          iterations are unknown, count a taken branch per peeled loop.  */
3328       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3329                                  NULL, 0, vect_prologue);
3330       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3331                                  NULL, 0, vect_epilogue);
3332     }
3333   else
3334     {
3335       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3336       peel_iters_prologue = niters < peel_iters_prologue ?
3337                             niters : peel_iters_prologue;
3338       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3339       /* If we need to peel for gaps, but no peeling is required, we have to
3340          peel VF iterations.  */
3341       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3342         *peel_iters_epilogue = assumed_vf;
3343     }
3344
3345   stmt_info_for_cost *si;
3346   int j;
3347   if (peel_iters_prologue)
3348     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3349         {
3350           stmt_vec_info stmt_info
3351             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3352           retval += record_stmt_cost (prologue_cost_vec,
3353                                       si->count * peel_iters_prologue,
3354                                       si->kind, stmt_info, si->misalign,
3355                                       vect_prologue);
3356         }
3357   if (*peel_iters_epilogue)
3358     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3359         {
3360           stmt_vec_info stmt_info
3361             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3362           retval += record_stmt_cost (epilogue_cost_vec,
3363                                       si->count * *peel_iters_epilogue,
3364                                       si->kind, stmt_info, si->misalign,
3365                                       vect_epilogue);
3366         }
3367
3368   return retval;
3369 }
3370
3371 /* Function vect_estimate_min_profitable_iters
3372
3373    Return the number of iterations required for the vector version of the
3374    loop to be profitable relative to the cost of the scalar version of the
3375    loop.
3376
3377    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3378    of iterations for vectorization.  -1 value means loop vectorization
3379    is not profitable.  This returned value may be used for dynamic
3380    profitability check.
3381
3382    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3383    for static check against estimated number of iterations.  */
3384
3385 static void
3386 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3387                                     int *ret_min_profitable_niters,
3388                                     int *ret_min_profitable_estimate)
3389 {
3390   int min_profitable_iters;
3391   int min_profitable_estimate;
3392   int peel_iters_prologue;
3393   int peel_iters_epilogue;
3394   unsigned vec_inside_cost = 0;
3395   int vec_outside_cost = 0;
3396   unsigned vec_prologue_cost = 0;
3397   unsigned vec_epilogue_cost = 0;
3398   int scalar_single_iter_cost = 0;
3399   int scalar_outside_cost = 0;
3400   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3401   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3402   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3403
3404   /* Cost model disabled.  */
3405   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3406     {
3407       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3408       *ret_min_profitable_niters = 0;
3409       *ret_min_profitable_estimate = 0;
3410       return;
3411     }
3412
3413   /* Requires loop versioning tests to handle misalignment.  */
3414   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3415     {
3416       /*  FIXME: Make cost depend on complexity of individual check.  */
3417       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3418       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3419                             vect_prologue);
3420       dump_printf (MSG_NOTE,
3421                    "cost model: Adding cost of checks for loop "
3422                    "versioning to treat misalignment.\n");
3423     }
3424
3425   /* Requires loop versioning with alias checks.  */
3426   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3427     {
3428       /*  FIXME: Make cost depend on complexity of individual check.  */
3429       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3430       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3431                             vect_prologue);
3432       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3433       if (len)
3434         /* Count LEN - 1 ANDs and LEN comparisons.  */
3435         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3436                               NULL, 0, vect_prologue);
3437       dump_printf (MSG_NOTE,
3438                    "cost model: Adding cost of checks for loop "
3439                    "versioning aliasing.\n");
3440     }
3441
3442   /* Requires loop versioning with niter checks.  */
3443   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3444     {
3445       /*  FIXME: Make cost depend on complexity of individual check.  */
3446       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3447                             vect_prologue);
3448       dump_printf (MSG_NOTE,
3449                    "cost model: Adding cost of checks for loop "
3450                    "versioning niters.\n");
3451     }
3452
3453   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3454     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3455                           vect_prologue);
3456
3457   /* Count statements in scalar loop.  Using this as scalar cost for a single
3458      iteration for now.
3459
3460      TODO: Add outer loop support.
3461
3462      TODO: Consider assigning different costs to different scalar
3463      statements.  */
3464
3465   scalar_single_iter_cost
3466     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3467
3468   /* Add additional cost for the peeled instructions in prologue and epilogue
3469      loop.
3470
3471      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3472      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3473
3474      TODO: Build an expression that represents peel_iters for prologue and
3475      epilogue to be used in a run-time test.  */
3476
3477   if (npeel  < 0)
3478     {
3479       peel_iters_prologue = assumed_vf / 2;
3480       dump_printf (MSG_NOTE, "cost model: "
3481                    "prologue peel iters set to vf/2.\n");
3482
3483       /* If peeling for alignment is unknown, loop bound of main loop becomes
3484          unknown.  */
3485       peel_iters_epilogue = assumed_vf / 2;
3486       dump_printf (MSG_NOTE, "cost model: "
3487                    "epilogue peel iters set to vf/2 because "
3488                    "peeling for alignment is unknown.\n");
3489
3490       /* If peeled iterations are unknown, count a taken branch and a not taken
3491          branch per peeled loop. Even if scalar loop iterations are known,
3492          vector iterations are not known since peeled prologue iterations are
3493          not known. Hence guards remain the same.  */
3494       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3495                             NULL, 0, vect_prologue);
3496       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3497                             NULL, 0, vect_prologue);
3498       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3499                             NULL, 0, vect_epilogue);
3500       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3501                             NULL, 0, vect_epilogue);
3502       stmt_info_for_cost *si;
3503       int j;
3504       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3505         {
3506           struct _stmt_vec_info *stmt_info
3507             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3508           (void) add_stmt_cost (target_cost_data,
3509                                 si->count * peel_iters_prologue,
3510                                 si->kind, stmt_info, si->misalign,
3511                                 vect_prologue);
3512           (void) add_stmt_cost (target_cost_data,
3513                                 si->count * peel_iters_epilogue,
3514                                 si->kind, stmt_info, si->misalign,
3515                                 vect_epilogue);
3516         }
3517     }
3518   else
3519     {
3520       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3521       stmt_info_for_cost *si;
3522       int j;
3523       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3524
3525       prologue_cost_vec.create (2);
3526       epilogue_cost_vec.create (2);
3527       peel_iters_prologue = npeel;
3528
3529       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3530                                           &peel_iters_epilogue,
3531                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3532                                             (loop_vinfo),
3533                                           &prologue_cost_vec,
3534                                           &epilogue_cost_vec);
3535
3536       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3537         {
3538           struct _stmt_vec_info *stmt_info
3539             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3540           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3541                                 si->misalign, vect_prologue);
3542         }
3543
3544       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3545         {
3546           struct _stmt_vec_info *stmt_info
3547             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3548           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3549                                 si->misalign, vect_epilogue);
3550         }
3551
3552       prologue_cost_vec.release ();
3553       epilogue_cost_vec.release ();
3554     }
3555
3556   /* FORNOW: The scalar outside cost is incremented in one of the
3557      following ways:
3558
3559      1. The vectorizer checks for alignment and aliasing and generates
3560      a condition that allows dynamic vectorization.  A cost model
3561      check is ANDED with the versioning condition.  Hence scalar code
3562      path now has the added cost of the versioning check.
3563
3564        if (cost > th & versioning_check)
3565          jmp to vector code
3566
3567      Hence run-time scalar is incremented by not-taken branch cost.
3568
3569      2. The vectorizer then checks if a prologue is required.  If the
3570      cost model check was not done before during versioning, it has to
3571      be done before the prologue check.
3572
3573        if (cost <= th)
3574          prologue = scalar_iters
3575        if (prologue == 0)
3576          jmp to vector code
3577        else
3578          execute prologue
3579        if (prologue == num_iters)
3580          go to exit
3581
3582      Hence the run-time scalar cost is incremented by a taken branch,
3583      plus a not-taken branch, plus a taken branch cost.
3584
3585      3. The vectorizer then checks if an epilogue is required.  If the
3586      cost model check was not done before during prologue check, it
3587      has to be done with the epilogue check.
3588
3589        if (prologue == 0)
3590          jmp to vector code
3591        else
3592          execute prologue
3593        if (prologue == num_iters)
3594          go to exit
3595        vector code:
3596          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3597            jmp to epilogue
3598
3599      Hence the run-time scalar cost should be incremented by 2 taken
3600      branches.
3601
3602      TODO: The back end may reorder the BBS's differently and reverse
3603      conditions/branch directions.  Change the estimates below to
3604      something more reasonable.  */
3605
3606   /* If the number of iterations is known and we do not do versioning, we can
3607      decide whether to vectorize at compile time.  Hence the scalar version
3608      do not carry cost model guard costs.  */
3609   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3610       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3611     {
3612       /* Cost model check occurs at versioning.  */
3613       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3614         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3615       else
3616         {
3617           /* Cost model check occurs at prologue generation.  */
3618           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3619             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3620               + vect_get_stmt_cost (cond_branch_not_taken);
3621           /* Cost model check occurs at epilogue generation.  */
3622           else
3623             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3624         }
3625     }
3626
3627   /* Complete the target-specific cost calculations.  */
3628   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3629                &vec_inside_cost, &vec_epilogue_cost);
3630
3631   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3632
3633   if (dump_enabled_p ())
3634     {
3635       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3636       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3637                    vec_inside_cost);
3638       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3639                    vec_prologue_cost);
3640       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3641                    vec_epilogue_cost);
3642       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3643                    scalar_single_iter_cost);
3644       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3645                    scalar_outside_cost);
3646       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3647                    vec_outside_cost);
3648       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3649                    peel_iters_prologue);
3650       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3651                    peel_iters_epilogue);
3652     }
3653
3654   /* Calculate number of iterations required to make the vector version
3655      profitable, relative to the loop bodies only.  The following condition
3656      must hold true:
3657      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3658      where
3659      SIC = scalar iteration cost, VIC = vector iteration cost,
3660      VOC = vector outside cost, VF = vectorization factor,
3661      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3662      SOC = scalar outside cost for run time cost model check.  */
3663
3664   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3665     {
3666       if (vec_outside_cost <= 0)
3667         min_profitable_iters = 0;
3668       else
3669         {
3670           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3671                                   * assumed_vf
3672                                   - vec_inside_cost * peel_iters_prologue
3673                                   - vec_inside_cost * peel_iters_epilogue)
3674                                  / ((scalar_single_iter_cost * assumed_vf)
3675                                     - vec_inside_cost);
3676
3677           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3678               <= (((int) vec_inside_cost * min_profitable_iters)
3679                   + (((int) vec_outside_cost - scalar_outside_cost)
3680                      * assumed_vf)))
3681             min_profitable_iters++;
3682         }
3683     }
3684   /* vector version will never be profitable.  */
3685   else
3686     {
3687       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3688         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3689                     "did not happen for a simd loop");
3690
3691       if (dump_enabled_p ())
3692         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3693                          "cost model: the vector iteration cost = %d "
3694                          "divided by the scalar iteration cost = %d "
3695                          "is greater or equal to the vectorization factor = %d"
3696                          ".\n",
3697                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3698       *ret_min_profitable_niters = -1;
3699       *ret_min_profitable_estimate = -1;
3700       return;
3701     }
3702
3703   dump_printf (MSG_NOTE,
3704                "  Calculated minimum iters for profitability: %d\n",
3705                min_profitable_iters);
3706
3707   /* We want the vectorized loop to execute at least once.  */
3708   if (min_profitable_iters < (assumed_vf + peel_iters_prologue))
3709     min_profitable_iters = assumed_vf + peel_iters_prologue;
3710
3711   if (dump_enabled_p ())
3712     dump_printf_loc (MSG_NOTE, vect_location,
3713                      "  Runtime profitability threshold = %d\n",
3714                      min_profitable_iters);
3715
3716   *ret_min_profitable_niters = min_profitable_iters;
3717
3718   /* Calculate number of iterations required to make the vector version
3719      profitable, relative to the loop bodies only.
3720
3721      Non-vectorized variant is SIC * niters and it must win over vector
3722      variant on the expected loop trip count.  The following condition must hold true:
3723      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3724
3725   if (vec_outside_cost <= 0)
3726     min_profitable_estimate = 0;
3727   else
3728     {
3729       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3730                                  * assumed_vf
3731                                  - vec_inside_cost * peel_iters_prologue
3732                                  - vec_inside_cost * peel_iters_epilogue)
3733                                  / ((scalar_single_iter_cost * assumed_vf)
3734                                    - vec_inside_cost);
3735     }
3736   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3737   if (dump_enabled_p ())
3738     dump_printf_loc (MSG_NOTE, vect_location,
3739                      "  Static estimate profitability threshold = %d\n",
3740                      min_profitable_estimate);
3741
3742   *ret_min_profitable_estimate = min_profitable_estimate;
3743 }
3744
3745 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3746    vector elements (not bits) for a vector with NELT elements.  */
3747 static void
3748 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3749                               vec_perm_builder *sel)
3750 {
3751   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3752      by vec_perm_indices.  */
3753   sel->new_vector (nelt, 1, 3);
3754   for (unsigned int i = 0; i < 3; i++)
3755     sel->quick_push (i + offset);
3756 }
3757
3758 /* Checks whether the target supports whole-vector shifts for vectors of mode
3759    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3760    it supports vec_perm_const with masks for all necessary shift amounts.  */
3761 static bool
3762 have_whole_vector_shift (machine_mode mode)
3763 {
3764   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3765     return true;
3766
3767   /* Variable-length vectors should be handled via the optab.  */
3768   unsigned int nelt;
3769   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3770     return false;
3771
3772   vec_perm_builder sel;
3773   vec_perm_indices indices;
3774   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3775     {
3776       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3777       indices.new_vector (sel, 2, nelt);
3778       if (!can_vec_perm_const_p (mode, indices, false))
3779         return false;
3780     }
3781   return true;
3782 }
3783
3784 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3785    functions. Design better to avoid maintenance issues.  */
3786
3787 /* Function vect_model_reduction_cost.
3788
3789    Models cost for a reduction operation, including the vector ops
3790    generated within the strip-mine loop, the initial definition before
3791    the loop, and the epilogue code that must be generated.  */
3792
3793 static void
3794 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3795                            int ncopies)
3796 {
3797   int prologue_cost = 0, epilogue_cost = 0;
3798   enum tree_code code;
3799   optab optab;
3800   tree vectype;
3801   gimple *orig_stmt;
3802   machine_mode mode;
3803   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3804   struct loop *loop = NULL;
3805   void *target_cost_data;
3806
3807   if (loop_vinfo)
3808     {
3809       loop = LOOP_VINFO_LOOP (loop_vinfo);
3810       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3811     }
3812   else
3813     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3814
3815   /* Condition reductions generate two reductions in the loop.  */
3816   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3817     ncopies *= 2;
3818
3819   /* Cost of reduction op inside loop.  */
3820   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3821                                         stmt_info, 0, vect_body);
3822
3823   vectype = STMT_VINFO_VECTYPE (stmt_info);
3824   mode = TYPE_MODE (vectype);
3825   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3826
3827   if (!orig_stmt)
3828     orig_stmt = STMT_VINFO_STMT (stmt_info);
3829
3830   code = gimple_assign_rhs_code (orig_stmt);
3831
3832   /* Add in cost for initial definition.
3833      For cond reduction we have four vectors: initial index, step, initial
3834      result of the data reduction, initial value of the index reduction.  */
3835   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3836                        == COND_REDUCTION ? 4 : 1;
3837   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3838                                   scalar_to_vec, stmt_info, 0,
3839                                   vect_prologue);
3840
3841   /* Determine cost of epilogue code.
3842
3843      We have a reduction operator that will reduce the vector in one statement.
3844      Also requires scalar extract.  */
3845
3846   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3847     {
3848       if (reduc_fn != IFN_LAST)
3849         {
3850           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3851             {
3852               /* An EQ stmt and an COND_EXPR stmt.  */
3853               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3854                                               vector_stmt, stmt_info, 0,
3855                                               vect_epilogue);
3856               /* Reduction of the max index and a reduction of the found
3857                  values.  */
3858               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3859                                               vec_to_scalar, stmt_info, 0,
3860                                               vect_epilogue);
3861               /* A broadcast of the max value.  */
3862               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3863                                               scalar_to_vec, stmt_info, 0,
3864                                               vect_epilogue);
3865             }
3866           else
3867             {
3868               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3869                                               stmt_info, 0, vect_epilogue);
3870               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3871                                               vec_to_scalar, stmt_info, 0,
3872                                               vect_epilogue);
3873             }
3874         }
3875       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3876         {
3877           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3878           /* Extraction of scalar elements.  */
3879           epilogue_cost += add_stmt_cost (target_cost_data,
3880                                           2 * estimated_nunits,
3881                                           vec_to_scalar, stmt_info, 0,
3882                                           vect_epilogue);
3883           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3884           epilogue_cost += add_stmt_cost (target_cost_data,
3885                                           2 * estimated_nunits - 3,
3886                                           scalar_stmt, stmt_info, 0,
3887                                           vect_epilogue);
3888         }
3889       else
3890         {
3891           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3892           tree bitsize =
3893             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3894           int element_bitsize = tree_to_uhwi (bitsize);
3895           int nelements = vec_size_in_bits / element_bitsize;
3896
3897           if (code == COND_EXPR)
3898             code = MAX_EXPR;
3899
3900           optab = optab_for_tree_code (code, vectype, optab_default);
3901
3902           /* We have a whole vector shift available.  */
3903           if (optab != unknown_optab
3904               && VECTOR_MODE_P (mode)
3905               && optab_handler (optab, mode) != CODE_FOR_nothing
3906               && have_whole_vector_shift (mode))
3907             {
3908               /* Final reduction via vector shifts and the reduction operator.
3909                  Also requires scalar extract.  */
3910               epilogue_cost += add_stmt_cost (target_cost_data,
3911                                               exact_log2 (nelements) * 2,
3912                                               vector_stmt, stmt_info, 0,
3913                                               vect_epilogue);
3914               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3915                                               vec_to_scalar, stmt_info, 0,
3916                                               vect_epilogue);
3917             }
3918           else
3919             /* Use extracts and reduction op for final reduction.  For N
3920                elements, we have N extracts and N-1 reduction ops.  */
3921             epilogue_cost += add_stmt_cost (target_cost_data,
3922                                             nelements + nelements - 1,
3923                                             vector_stmt, stmt_info, 0,
3924                                             vect_epilogue);
3925         }
3926     }
3927
3928   if (dump_enabled_p ())
3929     dump_printf (MSG_NOTE,
3930                  "vect_model_reduction_cost: inside_cost = %d, "
3931                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3932                  prologue_cost, epilogue_cost);
3933 }
3934
3935
3936 /* Function vect_model_induction_cost.
3937
3938    Models cost for induction operations.  */
3939
3940 static void
3941 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3942 {
3943   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3944   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3945   unsigned inside_cost, prologue_cost;
3946
3947   if (PURE_SLP_STMT (stmt_info))
3948     return;
3949
3950   /* loop cost for vec_loop.  */
3951   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3952                                stmt_info, 0, vect_body);
3953
3954   /* prologue cost for vec_init and vec_step.  */
3955   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3956                                  stmt_info, 0, vect_prologue);
3957
3958   if (dump_enabled_p ())
3959     dump_printf_loc (MSG_NOTE, vect_location,
3960                      "vect_model_induction_cost: inside_cost = %d, "
3961                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3962 }
3963
3964
3965
3966 /* Function get_initial_def_for_reduction
3967
3968    Input:
3969    STMT - a stmt that performs a reduction operation in the loop.
3970    INIT_VAL - the initial value of the reduction variable
3971
3972    Output:
3973    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3974         of the reduction (used for adjusting the epilog - see below).
3975    Return a vector variable, initialized according to the operation that STMT
3976         performs. This vector will be used as the initial value of the
3977         vector of partial results.
3978
3979    Option1 (adjust in epilog): Initialize the vector as follows:
3980      add/bit or/xor:    [0,0,...,0,0]
3981      mult/bit and:      [1,1,...,1,1]
3982      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3983    and when necessary (e.g. add/mult case) let the caller know
3984    that it needs to adjust the result by init_val.
3985
3986    Option2: Initialize the vector as follows:
3987      add/bit or/xor:    [init_val,0,0,...,0]
3988      mult/bit and:      [init_val,1,1,...,1]
3989      min/max/cond_expr: [init_val,init_val,...,init_val]
3990    and no adjustments are needed.
3991
3992    For example, for the following code:
3993
3994    s = init_val;
3995    for (i=0;i<n;i++)
3996      s = s + a[i];
3997
3998    STMT is 's = s + a[i]', and the reduction variable is 's'.
3999    For a vector of 4 units, we want to return either [0,0,0,init_val],
4000    or [0,0,0,0] and let the caller know that it needs to adjust
4001    the result at the end by 'init_val'.
4002
4003    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4004    initialization vector is simpler (same element in all entries), if
4005    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4006
4007    A cost model should help decide between these two schemes.  */
4008
4009 tree
4010 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4011                                tree *adjustment_def)
4012 {
4013   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4014   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4015   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4016   tree scalar_type = TREE_TYPE (init_val);
4017   tree vectype = get_vectype_for_scalar_type (scalar_type);
4018   enum tree_code code = gimple_assign_rhs_code (stmt);
4019   tree def_for_init;
4020   tree init_def;
4021   bool nested_in_vect_loop = false;
4022   REAL_VALUE_TYPE real_init_val = dconst0;
4023   int int_init_val = 0;
4024   gimple *def_stmt = NULL;
4025   gimple_seq stmts = NULL;
4026
4027   gcc_assert (vectype);
4028
4029   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4030               || SCALAR_FLOAT_TYPE_P (scalar_type));
4031
4032   if (nested_in_vect_loop_p (loop, stmt))
4033     nested_in_vect_loop = true;
4034   else
4035     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4036
4037   /* In case of double reduction we only create a vector variable to be put
4038      in the reduction phi node.  The actual statement creation is done in
4039      vect_create_epilog_for_reduction.  */
4040   if (adjustment_def && nested_in_vect_loop
4041       && TREE_CODE (init_val) == SSA_NAME
4042       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4043       && gimple_code (def_stmt) == GIMPLE_PHI
4044       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4045       && vinfo_for_stmt (def_stmt)
4046       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4047           == vect_double_reduction_def)
4048     {
4049       *adjustment_def = NULL;
4050       return vect_create_destination_var (init_val, vectype);
4051     }
4052
4053   /* In case of a nested reduction do not use an adjustment def as
4054      that case is not supported by the epilogue generation correctly
4055      if ncopies is not one.  */
4056   if (adjustment_def && nested_in_vect_loop)
4057     {
4058       *adjustment_def = NULL;
4059       return vect_get_vec_def_for_operand (init_val, stmt);
4060     }
4061
4062   switch (code)
4063     {
4064     case WIDEN_SUM_EXPR:
4065     case DOT_PROD_EXPR:
4066     case SAD_EXPR:
4067     case PLUS_EXPR:
4068     case MINUS_EXPR:
4069     case BIT_IOR_EXPR:
4070     case BIT_XOR_EXPR:
4071     case MULT_EXPR:
4072     case BIT_AND_EXPR:
4073       {
4074         /* ADJUSTMENT_DEF is NULL when called from
4075            vect_create_epilog_for_reduction to vectorize double reduction.  */
4076         if (adjustment_def)
4077           *adjustment_def = init_val;
4078
4079         if (code == MULT_EXPR)
4080           {
4081             real_init_val = dconst1;
4082             int_init_val = 1;
4083           }
4084
4085         if (code == BIT_AND_EXPR)
4086           int_init_val = -1;
4087
4088         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4089           def_for_init = build_real (scalar_type, real_init_val);
4090         else
4091           def_for_init = build_int_cst (scalar_type, int_init_val);
4092
4093         if (adjustment_def)
4094           /* Option1: the first element is '0' or '1' as well.  */
4095           init_def = gimple_build_vector_from_val (&stmts, vectype,
4096                                                    def_for_init);
4097         else
4098           {
4099             /* Option2: the first element is INIT_VAL.  */
4100             tree_vector_builder elts (vectype, 1, 2);
4101             elts.quick_push (init_val);
4102             elts.quick_push (def_for_init);
4103             init_def = gimple_build_vector (&stmts, &elts);
4104           }
4105       }
4106       break;
4107
4108     case MIN_EXPR:
4109     case MAX_EXPR:
4110     case COND_EXPR:
4111       {
4112         if (adjustment_def)
4113           {
4114             *adjustment_def = NULL_TREE;
4115             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4116               {
4117                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4118                 break;
4119               }
4120           }
4121         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4122         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4123       }
4124       break;
4125
4126     default:
4127       gcc_unreachable ();
4128     }
4129
4130   if (stmts)
4131     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4132   return init_def;
4133 }
4134
4135 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4136    NUMBER_OF_VECTORS is the number of vector defs to create.  */
4137
4138 static void
4139 get_initial_defs_for_reduction (slp_tree slp_node,
4140                                 vec<tree> *vec_oprnds,
4141                                 unsigned int number_of_vectors,
4142                                 enum tree_code code, bool reduc_chain)
4143 {
4144   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4145   gimple *stmt = stmts[0];
4146   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4147   unsigned nunits;
4148   unsigned j, number_of_places_left_in_vector;
4149   tree vector_type, scalar_type;
4150   tree vop;
4151   int group_size = stmts.length ();
4152   unsigned int vec_num, i;
4153   unsigned number_of_copies = 1;
4154   vec<tree> voprnds;
4155   voprnds.create (number_of_vectors);
4156   tree neutral_op = NULL;
4157   struct loop *loop;
4158
4159   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4160   scalar_type = TREE_TYPE (vector_type);
4161   /* vectorizable_reduction has already rejected SLP reductions on
4162      variable-length vectors.  */
4163   nunits = TYPE_VECTOR_SUBPARTS (vector_type).to_constant ();
4164
4165   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4166
4167   loop = (gimple_bb (stmt))->loop_father;
4168   gcc_assert (loop);
4169   edge pe = loop_preheader_edge (loop);
4170
4171   /* op is the reduction operand of the first stmt already.  */
4172   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4173      we need either neutral operands or the original operands.  See
4174      get_initial_def_for_reduction() for details.  */
4175   switch (code)
4176     {
4177     case WIDEN_SUM_EXPR:
4178     case DOT_PROD_EXPR:
4179     case SAD_EXPR:
4180     case PLUS_EXPR:
4181     case MINUS_EXPR:
4182     case BIT_IOR_EXPR:
4183     case BIT_XOR_EXPR:
4184       neutral_op = build_zero_cst (scalar_type);
4185       break;
4186
4187     case MULT_EXPR:
4188       neutral_op = build_one_cst (scalar_type);
4189       break;
4190
4191     case BIT_AND_EXPR:
4192       neutral_op = build_all_ones_cst (scalar_type);
4193       break;
4194
4195     /* For MIN/MAX we don't have an easy neutral operand but
4196        the initial values can be used fine here.  Only for
4197        a reduction chain we have to force a neutral element.  */
4198     case MAX_EXPR:
4199     case MIN_EXPR:
4200       if (! reduc_chain)
4201         neutral_op = NULL;
4202       else
4203         neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4204       break;
4205
4206     default:
4207       gcc_assert (! reduc_chain);
4208       neutral_op = NULL;
4209     }
4210
4211   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4212      created vectors. It is greater than 1 if unrolling is performed.
4213
4214      For example, we have two scalar operands, s1 and s2 (e.g., group of
4215      strided accesses of size two), while NUNITS is four (i.e., four scalars
4216      of this type can be packed in a vector).  The output vector will contain
4217      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4218      will be 2).
4219
4220      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4221      containing the operands.
4222
4223      For example, NUNITS is four as before, and the group size is 8
4224      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4225      {s5, s6, s7, s8}.  */
4226
4227   number_of_copies = nunits * number_of_vectors / group_size;
4228
4229   number_of_places_left_in_vector = nunits;
4230   tree_vector_builder elts (vector_type, nunits, 1);
4231   elts.quick_grow (nunits);
4232   for (j = 0; j < number_of_copies; j++)
4233     {
4234       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4235         {
4236           tree op;
4237           /* Get the def before the loop.  In reduction chain we have only
4238              one initial value.  */
4239           if ((j != (number_of_copies - 1)
4240                || (reduc_chain && i != 0))
4241               && neutral_op)
4242             op = neutral_op;
4243           else
4244             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4245
4246           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4247           number_of_places_left_in_vector--;
4248           elts[number_of_places_left_in_vector] = op;
4249
4250           if (number_of_places_left_in_vector == 0)
4251             {
4252               gimple_seq ctor_seq = NULL;
4253               tree init = gimple_build_vector (&ctor_seq, &elts);
4254               if (ctor_seq != NULL)
4255                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4256               voprnds.quick_push (init);
4257
4258               number_of_places_left_in_vector = nunits;
4259               elts.new_vector (vector_type, nunits, 1);
4260               elts.quick_grow (nunits);
4261             }
4262         }
4263     }
4264
4265   /* Since the vectors are created in the reverse order, we should invert
4266      them.  */
4267   vec_num = voprnds.length ();
4268   for (j = vec_num; j != 0; j--)
4269     {
4270       vop = voprnds[j - 1];
4271       vec_oprnds->quick_push (vop);
4272     }
4273
4274   voprnds.release ();
4275
4276   /* In case that VF is greater than the unrolling factor needed for the SLP
4277      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4278      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4279      to replicate the vectors.  */
4280   tree neutral_vec = NULL;
4281   while (number_of_vectors > vec_oprnds->length ())
4282     {
4283       if (neutral_op)
4284         {
4285           if (!neutral_vec)
4286             {
4287               gimple_seq ctor_seq = NULL;
4288               neutral_vec = gimple_build_vector_from_val
4289                 (&ctor_seq, vector_type, neutral_op);
4290               if (ctor_seq != NULL)
4291                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4292             }
4293           vec_oprnds->quick_push (neutral_vec);
4294         }
4295       else
4296         {
4297           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4298             vec_oprnds->quick_push (vop);
4299         }
4300     }
4301 }
4302
4303
4304 /* Function vect_create_epilog_for_reduction
4305
4306    Create code at the loop-epilog to finalize the result of a reduction
4307    computation.
4308
4309    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4310      reduction statements.
4311    STMT is the scalar reduction stmt that is being vectorized.
4312    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4313      number of elements that we can fit in a vectype (nunits).  In this case
4314      we have to generate more than one vector stmt - i.e - we need to "unroll"
4315      the vector stmt by a factor VF/nunits.  For more details see documentation
4316      in vectorizable_operation.
4317    REDUC_FN is the internal function for the epilog reduction.
4318    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4319      computation.
4320    REDUC_INDEX is the index of the operand in the right hand side of the
4321      statement that is defined by REDUCTION_PHI.
4322    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4323    SLP_NODE is an SLP node containing a group of reduction statements. The
4324      first one in this group is STMT.
4325    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4326      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4327      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4328      any value of the IV in the loop.
4329    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4330
4331    This function:
4332    1. Creates the reduction def-use cycles: sets the arguments for
4333       REDUCTION_PHIS:
4334       The loop-entry argument is the vectorized initial-value of the reduction.
4335       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4336       sums.
4337    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4338       by calling the function specified by REDUC_FN if available, or by
4339       other means (whole-vector shifts or a scalar loop).
4340       The function also creates a new phi node at the loop exit to preserve
4341       loop-closed form, as illustrated below.
4342
4343      The flow at the entry to this function:
4344
4345         loop:
4346           vec_def = phi <null, null>            # REDUCTION_PHI
4347           VECT_DEF = vector_stmt                # vectorized form of STMT
4348           s_loop = scalar_stmt                  # (scalar) STMT
4349         loop_exit:
4350           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4351           use <s_out0>
4352           use <s_out0>
4353
4354      The above is transformed by this function into:
4355
4356         loop:
4357           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4358           VECT_DEF = vector_stmt                # vectorized form of STMT
4359           s_loop = scalar_stmt                  # (scalar) STMT
4360         loop_exit:
4361           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4362           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4363           v_out2 = reduce <v_out1>
4364           s_out3 = extract_field <v_out2, 0>
4365           s_out4 = adjust_result <s_out3>
4366           use <s_out4>
4367           use <s_out4>
4368 */
4369
4370 static void
4371 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4372                                   gimple *reduc_def_stmt,
4373                                   int ncopies, internal_fn reduc_fn,
4374                                   vec<gimple *> reduction_phis,
4375                                   bool double_reduc,
4376                                   slp_tree slp_node,
4377                                   slp_instance slp_node_instance,
4378                                   tree induc_val, enum tree_code induc_code)
4379 {
4380   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4381   stmt_vec_info prev_phi_info;
4382   tree vectype;
4383   machine_mode mode;
4384   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4385   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4386   basic_block exit_bb;
4387   tree scalar_dest;
4388   tree scalar_type;
4389   gimple *new_phi = NULL, *phi;
4390   gimple_stmt_iterator exit_gsi;
4391   tree vec_dest;
4392   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4393   gimple *epilog_stmt = NULL;
4394   enum tree_code code = gimple_assign_rhs_code (stmt);
4395   gimple *exit_phi;
4396   tree bitsize;
4397   tree adjustment_def = NULL;
4398   tree vec_initial_def = NULL;
4399   tree expr, def, initial_def = NULL;
4400   tree orig_name, scalar_result;
4401   imm_use_iterator imm_iter, phi_imm_iter;
4402   use_operand_p use_p, phi_use_p;
4403   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4404   bool nested_in_vect_loop = false;
4405   auto_vec<gimple *> new_phis;
4406   auto_vec<gimple *> inner_phis;
4407   enum vect_def_type dt = vect_unknown_def_type;
4408   int j, i;
4409   auto_vec<tree> scalar_results;
4410   unsigned int group_size = 1, k, ratio;
4411   auto_vec<tree> vec_initial_defs;
4412   auto_vec<gimple *> phis;
4413   bool slp_reduc = false;
4414   tree new_phi_result;
4415   gimple *inner_phi = NULL;
4416   tree induction_index = NULL_TREE;
4417
4418   if (slp_node)
4419     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4420
4421   if (nested_in_vect_loop_p (loop, stmt))
4422     {
4423       outer_loop = loop;
4424       loop = loop->inner;
4425       nested_in_vect_loop = true;
4426       gcc_assert (!slp_node);
4427     }
4428
4429   vectype = STMT_VINFO_VECTYPE (stmt_info);
4430   gcc_assert (vectype);
4431   mode = TYPE_MODE (vectype);
4432
4433   /* 1. Create the reduction def-use cycle:
4434      Set the arguments of REDUCTION_PHIS, i.e., transform
4435
4436         loop:
4437           vec_def = phi <null, null>            # REDUCTION_PHI
4438           VECT_DEF = vector_stmt                # vectorized form of STMT
4439           ...
4440
4441      into:
4442
4443         loop:
4444           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4445           VECT_DEF = vector_stmt                # vectorized form of STMT
4446           ...
4447
4448      (in case of SLP, do it for all the phis). */
4449
4450   /* Get the loop-entry arguments.  */
4451   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4452   if (slp_node)
4453     {
4454       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4455       vec_initial_defs.reserve (vec_num);
4456       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4457                                       &vec_initial_defs, vec_num, code,
4458                                       GROUP_FIRST_ELEMENT (stmt_info));
4459     }
4460   else
4461     {
4462       /* Get at the scalar def before the loop, that defines the initial value
4463          of the reduction variable.  */
4464       gimple *def_stmt;
4465       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4466                                            loop_preheader_edge (loop));
4467       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4468          and we can't use zero for induc_val, use initial_def.  Similarly
4469          for REDUC_MIN and initial_def larger than the base.  */
4470       if (TREE_CODE (initial_def) == INTEGER_CST
4471           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4472               == INTEGER_INDUC_COND_REDUCTION)
4473           && !integer_zerop (induc_val)
4474           && ((induc_code == MAX_EXPR
4475                && tree_int_cst_lt (initial_def, induc_val))
4476               || (induc_code == MIN_EXPR
4477                   && tree_int_cst_lt (induc_val, initial_def))))
4478         induc_val = initial_def;
4479       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4480       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4481                                                        &adjustment_def);
4482       vec_initial_defs.create (1);
4483       vec_initial_defs.quick_push (vec_initial_def);
4484     }
4485
4486   /* Set phi nodes arguments.  */
4487   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4488     {
4489       tree vec_init_def = vec_initial_defs[i];
4490       tree def = vect_defs[i];
4491       for (j = 0; j < ncopies; j++)
4492         {
4493           if (j != 0)
4494             {
4495               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4496               if (nested_in_vect_loop)
4497                 vec_init_def
4498                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4499                                                     vec_init_def);
4500             }
4501
4502           /* Set the loop-entry arg of the reduction-phi.  */
4503
4504           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4505               == INTEGER_INDUC_COND_REDUCTION)
4506             {
4507               /* Initialise the reduction phi to zero.  This prevents initial
4508                  values of non-zero interferring with the reduction op.  */
4509               gcc_assert (ncopies == 1);
4510               gcc_assert (i == 0);
4511
4512               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4513               tree induc_val_vec
4514                 = build_vector_from_val (vec_init_def_type, induc_val);
4515
4516               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4517                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4518             }
4519           else
4520             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4521                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4522
4523           /* Set the loop-latch arg for the reduction-phi.  */
4524           if (j > 0)
4525             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4526
4527           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4528                        UNKNOWN_LOCATION);
4529
4530           if (dump_enabled_p ())
4531             {
4532               dump_printf_loc (MSG_NOTE, vect_location,
4533                                "transform reduction: created def-use cycle: ");
4534               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4535               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4536             }
4537         }
4538     }
4539
4540   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4541      which is updated with the current index of the loop for every match of
4542      the original loop's cond_expr (VEC_STMT).  This results in a vector
4543      containing the last time the condition passed for that vector lane.
4544      The first match will be a 1 to allow 0 to be used for non-matching
4545      indexes.  If there are no matches at all then the vector will be all
4546      zeroes.  */
4547   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4548     {
4549       tree indx_before_incr, indx_after_incr;
4550       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4551
4552       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4553       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4554
4555       int scalar_precision
4556         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4557       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4558       tree cr_index_vector_type = build_vector_type
4559         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4560
4561       /* First we create a simple vector induction variable which starts
4562          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4563          vector size (STEP).  */
4564
4565       /* Create a {1,2,3,...} vector.  */
4566       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4567
4568       /* Create a vector of the step value.  */
4569       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4570       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4571
4572       /* Create an induction variable.  */
4573       gimple_stmt_iterator incr_gsi;
4574       bool insert_after;
4575       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4576       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4577                  insert_after, &indx_before_incr, &indx_after_incr);
4578
4579       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4580          filled with zeros (VEC_ZERO).  */
4581
4582       /* Create a vector of 0s.  */
4583       tree zero = build_zero_cst (cr_index_scalar_type);
4584       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4585
4586       /* Create a vector phi node.  */
4587       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4588       new_phi = create_phi_node (new_phi_tree, loop->header);
4589       set_vinfo_for_stmt (new_phi,
4590                           new_stmt_vec_info (new_phi, loop_vinfo));
4591       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4592                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4593
4594       /* Now take the condition from the loops original cond_expr
4595          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4596          every match uses values from the induction variable
4597          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4598          (NEW_PHI_TREE).
4599          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4600          the new cond_expr (INDEX_COND_EXPR).  */
4601
4602       /* Duplicate the condition from vec_stmt.  */
4603       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4604
4605       /* Create a conditional, where the condition is taken from vec_stmt
4606          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4607          else is the phi (NEW_PHI_TREE).  */
4608       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4609                                      ccompare, indx_before_incr,
4610                                      new_phi_tree);
4611       induction_index = make_ssa_name (cr_index_vector_type);
4612       gimple *index_condition = gimple_build_assign (induction_index,
4613                                                      index_cond_expr);
4614       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4615       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4616                                                         loop_vinfo);
4617       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4618       set_vinfo_for_stmt (index_condition, index_vec_info);
4619
4620       /* Update the phi with the vec cond.  */
4621       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4622                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4623     }
4624
4625   /* 2. Create epilog code.
4626         The reduction epilog code operates across the elements of the vector
4627         of partial results computed by the vectorized loop.
4628         The reduction epilog code consists of:
4629
4630         step 1: compute the scalar result in a vector (v_out2)
4631         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4632         step 3: adjust the scalar result (s_out3) if needed.
4633
4634         Step 1 can be accomplished using one the following three schemes:
4635           (scheme 1) using reduc_fn, if available.
4636           (scheme 2) using whole-vector shifts, if available.
4637           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4638                      combined.
4639
4640           The overall epilog code looks like this:
4641
4642           s_out0 = phi <s_loop>         # original EXIT_PHI
4643           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4644           v_out2 = reduce <v_out1>              # step 1
4645           s_out3 = extract_field <v_out2, 0>    # step 2
4646           s_out4 = adjust_result <s_out3>       # step 3
4647
4648           (step 3 is optional, and steps 1 and 2 may be combined).
4649           Lastly, the uses of s_out0 are replaced by s_out4.  */
4650
4651
4652   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4653          v_out1 = phi <VECT_DEF>
4654          Store them in NEW_PHIS.  */
4655
4656   exit_bb = single_exit (loop)->dest;
4657   prev_phi_info = NULL;
4658   new_phis.create (vect_defs.length ());
4659   FOR_EACH_VEC_ELT (vect_defs, i, def)
4660     {
4661       for (j = 0; j < ncopies; j++)
4662         {
4663           tree new_def = copy_ssa_name (def);
4664           phi = create_phi_node (new_def, exit_bb);
4665           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4666           if (j == 0)
4667             new_phis.quick_push (phi);
4668           else
4669             {
4670               def = vect_get_vec_def_for_stmt_copy (dt, def);
4671               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4672             }
4673
4674           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4675           prev_phi_info = vinfo_for_stmt (phi);
4676         }
4677     }
4678
4679   /* The epilogue is created for the outer-loop, i.e., for the loop being
4680      vectorized.  Create exit phis for the outer loop.  */
4681   if (double_reduc)
4682     {
4683       loop = outer_loop;
4684       exit_bb = single_exit (loop)->dest;
4685       inner_phis.create (vect_defs.length ());
4686       FOR_EACH_VEC_ELT (new_phis, i, phi)
4687         {
4688           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4689           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4690           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4691                            PHI_RESULT (phi));
4692           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4693                                                             loop_vinfo));
4694           inner_phis.quick_push (phi);
4695           new_phis[i] = outer_phi;
4696           prev_phi_info = vinfo_for_stmt (outer_phi);
4697           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4698             {
4699               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4700               new_result = copy_ssa_name (PHI_RESULT (phi));
4701               outer_phi = create_phi_node (new_result, exit_bb);
4702               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4703                                PHI_RESULT (phi));
4704               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4705                                                                 loop_vinfo));
4706               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4707               prev_phi_info = vinfo_for_stmt (outer_phi);
4708             }
4709         }
4710     }
4711
4712   exit_gsi = gsi_after_labels (exit_bb);
4713
4714   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4715          (i.e. when reduc_fn is not available) and in the final adjustment
4716          code (if needed).  Also get the original scalar reduction variable as
4717          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4718          represents a reduction pattern), the tree-code and scalar-def are
4719          taken from the original stmt that the pattern-stmt (STMT) replaces.
4720          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4721          are taken from STMT.  */
4722
4723   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4724   if (!orig_stmt)
4725     {
4726       /* Regular reduction  */
4727       orig_stmt = stmt;
4728     }
4729   else
4730     {
4731       /* Reduction pattern  */
4732       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4733       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4734       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4735     }
4736
4737   code = gimple_assign_rhs_code (orig_stmt);
4738   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4739      partial results are added and not subtracted.  */
4740   if (code == MINUS_EXPR)
4741     code = PLUS_EXPR;
4742
4743   scalar_dest = gimple_assign_lhs (orig_stmt);
4744   scalar_type = TREE_TYPE (scalar_dest);
4745   scalar_results.create (group_size);
4746   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4747   bitsize = TYPE_SIZE (scalar_type);
4748
4749   /* In case this is a reduction in an inner-loop while vectorizing an outer
4750      loop - we don't need to extract a single scalar result at the end of the
4751      inner-loop (unless it is double reduction, i.e., the use of reduction is
4752      outside the outer-loop).  The final vector of partial results will be used
4753      in the vectorized outer-loop, or reduced to a scalar result at the end of
4754      the outer-loop.  */
4755   if (nested_in_vect_loop && !double_reduc)
4756     goto vect_finalize_reduction;
4757
4758   /* SLP reduction without reduction chain, e.g.,
4759      # a1 = phi <a2, a0>
4760      # b1 = phi <b2, b0>
4761      a2 = operation (a1)
4762      b2 = operation (b1)  */
4763   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4764
4765   /* In case of reduction chain, e.g.,
4766      # a1 = phi <a3, a0>
4767      a2 = operation (a1)
4768      a3 = operation (a2),
4769
4770      we may end up with more than one vector result.  Here we reduce them to
4771      one vector.  */
4772   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4773     {
4774       tree first_vect = PHI_RESULT (new_phis[0]);
4775       gassign *new_vec_stmt = NULL;
4776       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4777       for (k = 1; k < new_phis.length (); k++)
4778         {
4779           gimple *next_phi = new_phis[k];
4780           tree second_vect = PHI_RESULT (next_phi);
4781           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4782           new_vec_stmt = gimple_build_assign (tem, code,
4783                                               first_vect, second_vect);
4784           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4785           first_vect = tem;
4786         }
4787
4788       new_phi_result = first_vect;
4789       if (new_vec_stmt)
4790         {
4791           new_phis.truncate (0);
4792           new_phis.safe_push (new_vec_stmt);
4793         }
4794     }
4795   /* Likewise if we couldn't use a single defuse cycle.  */
4796   else if (ncopies > 1)
4797     {
4798       gcc_assert (new_phis.length () == 1);
4799       tree first_vect = PHI_RESULT (new_phis[0]);
4800       gassign *new_vec_stmt = NULL;
4801       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4802       gimple *next_phi = new_phis[0];
4803       for (int k = 1; k < ncopies; ++k)
4804         {
4805           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4806           tree second_vect = PHI_RESULT (next_phi);
4807           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4808           new_vec_stmt = gimple_build_assign (tem, code,
4809                                               first_vect, second_vect);
4810           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4811           first_vect = tem;
4812         }
4813       new_phi_result = first_vect;
4814       new_phis.truncate (0);
4815       new_phis.safe_push (new_vec_stmt);
4816     }
4817   else
4818     new_phi_result = PHI_RESULT (new_phis[0]);
4819
4820   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4821       && reduc_fn != IFN_LAST)
4822     {
4823       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4824          various data values where the condition matched and another vector
4825          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4826          need to extract the last matching index (which will be the index with
4827          highest value) and use this to index into the data vector.
4828          For the case where there were no matches, the data vector will contain
4829          all default values and the index vector will be all zeros.  */
4830
4831       /* Get various versions of the type of the vector of indexes.  */
4832       tree index_vec_type = TREE_TYPE (induction_index);
4833       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4834       tree index_scalar_type = TREE_TYPE (index_vec_type);
4835       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4836         (index_vec_type);
4837
4838       /* Get an unsigned integer version of the type of the data vector.  */
4839       int scalar_precision
4840         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4841       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4842       tree vectype_unsigned = build_vector_type
4843         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4844
4845       /* First we need to create a vector (ZERO_VEC) of zeros and another
4846          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4847          can create using a MAX reduction and then expanding.
4848          In the case where the loop never made any matches, the max index will
4849          be zero.  */
4850
4851       /* Vector of {0, 0, 0,...}.  */
4852       tree zero_vec = make_ssa_name (vectype);
4853       tree zero_vec_rhs = build_zero_cst (vectype);
4854       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4855       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4856
4857       /* Find maximum value from the vector of found indexes.  */
4858       tree max_index = make_ssa_name (index_scalar_type);
4859       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4860                                                           1, induction_index);
4861       gimple_call_set_lhs (max_index_stmt, max_index);
4862       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4863
4864       /* Vector of {max_index, max_index, max_index,...}.  */
4865       tree max_index_vec = make_ssa_name (index_vec_type);
4866       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4867                                                       max_index);
4868       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4869                                                         max_index_vec_rhs);
4870       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4871
4872       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4873          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4874          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4875          otherwise.  Only one value should match, resulting in a vector
4876          (VEC_COND) with one data value and the rest zeros.
4877          In the case where the loop never made any matches, every index will
4878          match, resulting in a vector with all data values (which will all be
4879          the default value).  */
4880
4881       /* Compare the max index vector to the vector of found indexes to find
4882          the position of the max value.  */
4883       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4884       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4885                                                       induction_index,
4886                                                       max_index_vec);
4887       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4888
4889       /* Use the compare to choose either values from the data vector or
4890          zero.  */
4891       tree vec_cond = make_ssa_name (vectype);
4892       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4893                                                    vec_compare, new_phi_result,
4894                                                    zero_vec);
4895       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4896
4897       /* Finally we need to extract the data value from the vector (VEC_COND)
4898          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4899          reduction, but because this doesn't exist, we can use a MAX reduction
4900          instead.  The data value might be signed or a float so we need to cast
4901          it first.
4902          In the case where the loop never made any matches, the data values are
4903          all identical, and so will reduce down correctly.  */
4904
4905       /* Make the matched data values unsigned.  */
4906       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4907       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4908                                        vec_cond);
4909       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4910                                                         VIEW_CONVERT_EXPR,
4911                                                         vec_cond_cast_rhs);
4912       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4913
4914       /* Reduce down to a scalar value.  */
4915       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4916       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4917                                                            1, vec_cond_cast);
4918       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4919       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4920
4921       /* Convert the reduced value back to the result type and set as the
4922          result.  */
4923       gimple_seq stmts = NULL;
4924       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4925                                data_reduc);
4926       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4927       scalar_results.safe_push (new_temp);
4928     }
4929   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4930            && reduc_fn == IFN_LAST)
4931     {
4932       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4933          idx = 0;
4934          idx_val = induction_index[0];
4935          val = data_reduc[0];
4936          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4937            if (induction_index[i] > idx_val)
4938              val = data_reduc[i], idx_val = induction_index[i];
4939          return val;  */
4940
4941       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4942       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4943       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4944       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4945       /* Enforced by vectorizable_reduction, which ensures we have target
4946          support before allowing a conditional reduction on variable-length
4947          vectors.  */
4948       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4949       tree idx_val = NULL_TREE, val = NULL_TREE;
4950       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4951         {
4952           tree old_idx_val = idx_val;
4953           tree old_val = val;
4954           idx_val = make_ssa_name (idx_eltype);
4955           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4956                                              build3 (BIT_FIELD_REF, idx_eltype,
4957                                                      induction_index,
4958                                                      bitsize_int (el_size),
4959                                                      bitsize_int (off)));
4960           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4961           val = make_ssa_name (data_eltype);
4962           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4963                                              build3 (BIT_FIELD_REF,
4964                                                      data_eltype,
4965                                                      new_phi_result,
4966                                                      bitsize_int (el_size),
4967                                                      bitsize_int (off)));
4968           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4969           if (off != 0)
4970             {
4971               tree new_idx_val = idx_val;
4972               tree new_val = val;
4973               if (off != v_size - el_size)
4974                 {
4975                   new_idx_val = make_ssa_name (idx_eltype);
4976                   epilog_stmt = gimple_build_assign (new_idx_val,
4977                                                      MAX_EXPR, idx_val,
4978                                                      old_idx_val);
4979                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4980                 }
4981               new_val = make_ssa_name (data_eltype);
4982               epilog_stmt = gimple_build_assign (new_val,
4983                                                  COND_EXPR,
4984                                                  build2 (GT_EXPR,
4985                                                          boolean_type_node,
4986                                                          idx_val,
4987                                                          old_idx_val),
4988                                                  val, old_val);
4989               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4990               idx_val = new_idx_val;
4991               val = new_val;
4992             }
4993         }
4994       /* Convert the reduced value back to the result type and set as the
4995          result.  */
4996       gimple_seq stmts = NULL;
4997       val = gimple_convert (&stmts, scalar_type, val);
4998       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4999       scalar_results.safe_push (val);
5000     }
5001
5002   /* 2.3 Create the reduction code, using one of the three schemes described
5003          above. In SLP we simply need to extract all the elements from the
5004          vector (without reducing them), so we use scalar shifts.  */
5005   else if (reduc_fn != IFN_LAST && !slp_reduc)
5006     {
5007       tree tmp;
5008       tree vec_elem_type;
5009
5010       /* Case 1:  Create:
5011          v_out2 = reduc_expr <v_out1>  */
5012
5013       if (dump_enabled_p ())
5014         dump_printf_loc (MSG_NOTE, vect_location,
5015                          "Reduce using direct vector reduction.\n");
5016
5017       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5018       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5019         {
5020           tree tmp_dest
5021             = vect_create_destination_var (scalar_dest, vec_elem_type);
5022           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5023                                                     new_phi_result);
5024           gimple_set_lhs (epilog_stmt, tmp_dest);
5025           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5026           gimple_set_lhs (epilog_stmt, new_temp);
5027           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5028
5029           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5030                                              new_temp);
5031         }
5032       else
5033         {
5034           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5035                                                     new_phi_result);
5036           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5037         }
5038
5039       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5040       gimple_set_lhs (epilog_stmt, new_temp);
5041       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5042
5043       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5044            == INTEGER_INDUC_COND_REDUCTION)
5045           && !operand_equal_p (initial_def, induc_val, 0))
5046         {
5047           /* Earlier we set the initial value to be a vector if induc_val
5048              values.  Check the result and if it is induc_val then replace
5049              with the original initial value, unless induc_val is
5050              the same as initial_def already.  */
5051           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5052                                   induc_val);
5053
5054           tmp = make_ssa_name (new_scalar_dest);
5055           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5056                                              initial_def, new_temp);
5057           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5058           new_temp = tmp;
5059         }
5060
5061       scalar_results.safe_push (new_temp);
5062     }
5063   else
5064     {
5065       bool reduce_with_shift = have_whole_vector_shift (mode);
5066       int element_bitsize = tree_to_uhwi (bitsize);
5067       /* Enforced by vectorizable_reduction, which disallows SLP reductions
5068          for variable-length vectors and also requires direct target support
5069          for loop reductions.  */
5070       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5071       tree vec_temp;
5072
5073       /* COND reductions all do the final reduction with MAX_EXPR
5074          or MIN_EXPR.  */
5075       if (code == COND_EXPR)
5076         {
5077           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5078               == INTEGER_INDUC_COND_REDUCTION)
5079             code = induc_code;
5080           else
5081             code = MAX_EXPR;
5082         }
5083
5084       /* Regardless of whether we have a whole vector shift, if we're
5085          emulating the operation via tree-vect-generic, we don't want
5086          to use it.  Only the first round of the reduction is likely
5087          to still be profitable via emulation.  */
5088       /* ??? It might be better to emit a reduction tree code here, so that
5089          tree-vect-generic can expand the first round via bit tricks.  */
5090       if (!VECTOR_MODE_P (mode))
5091         reduce_with_shift = false;
5092       else
5093         {
5094           optab optab = optab_for_tree_code (code, vectype, optab_default);
5095           if (optab_handler (optab, mode) == CODE_FOR_nothing)
5096             reduce_with_shift = false;
5097         }
5098
5099       if (reduce_with_shift && !slp_reduc)
5100         {
5101           int nelements = vec_size_in_bits / element_bitsize;
5102           vec_perm_builder sel;
5103           vec_perm_indices indices;
5104
5105           int elt_offset;
5106
5107           tree zero_vec = build_zero_cst (vectype);
5108           /* Case 2: Create:
5109              for (offset = nelements/2; offset >= 1; offset/=2)
5110                 {
5111                   Create:  va' = vec_shift <va, offset>
5112                   Create:  va = vop <va, va'>
5113                 }  */
5114
5115           tree rhs;
5116
5117           if (dump_enabled_p ())
5118             dump_printf_loc (MSG_NOTE, vect_location,
5119                              "Reduce using vector shifts\n");
5120
5121           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5122           new_temp = new_phi_result;
5123           for (elt_offset = nelements / 2;
5124                elt_offset >= 1;
5125                elt_offset /= 2)
5126             {
5127               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5128               indices.new_vector (sel, 2, nelements);
5129               tree mask = vect_gen_perm_mask_any (vectype, indices);
5130               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5131                                                  new_temp, zero_vec, mask);
5132               new_name = make_ssa_name (vec_dest, epilog_stmt);
5133               gimple_assign_set_lhs (epilog_stmt, new_name);
5134               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5135
5136               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5137                                                  new_temp);
5138               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5139               gimple_assign_set_lhs (epilog_stmt, new_temp);
5140               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5141             }
5142
5143           /* 2.4  Extract the final scalar result.  Create:
5144              s_out3 = extract_field <v_out2, bitpos>  */
5145
5146           if (dump_enabled_p ())
5147             dump_printf_loc (MSG_NOTE, vect_location,
5148                              "extract scalar result\n");
5149
5150           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5151                         bitsize, bitsize_zero_node);
5152           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5153           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5154           gimple_assign_set_lhs (epilog_stmt, new_temp);
5155           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5156           scalar_results.safe_push (new_temp);
5157         }
5158       else
5159         {
5160           /* Case 3: Create:
5161              s = extract_field <v_out2, 0>
5162              for (offset = element_size;
5163                   offset < vector_size;
5164                   offset += element_size;)
5165                {
5166                  Create:  s' = extract_field <v_out2, offset>
5167                  Create:  s = op <s, s'>  // For non SLP cases
5168                }  */
5169
5170           if (dump_enabled_p ())
5171             dump_printf_loc (MSG_NOTE, vect_location,
5172                              "Reduce using scalar code.\n");
5173
5174           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5175           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5176             {
5177               int bit_offset;
5178               if (gimple_code (new_phi) == GIMPLE_PHI)
5179                 vec_temp = PHI_RESULT (new_phi);
5180               else
5181                 vec_temp = gimple_assign_lhs (new_phi);
5182               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5183                                  bitsize_zero_node);
5184               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5185               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5186               gimple_assign_set_lhs (epilog_stmt, new_temp);
5187               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5188
5189               /* In SLP we don't need to apply reduction operation, so we just
5190                  collect s' values in SCALAR_RESULTS.  */
5191               if (slp_reduc)
5192                 scalar_results.safe_push (new_temp);
5193
5194               for (bit_offset = element_bitsize;
5195                    bit_offset < vec_size_in_bits;
5196                    bit_offset += element_bitsize)
5197                 {
5198                   tree bitpos = bitsize_int (bit_offset);
5199                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5200                                      bitsize, bitpos);
5201
5202                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5203                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5204                   gimple_assign_set_lhs (epilog_stmt, new_name);
5205                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5206
5207                   if (slp_reduc)
5208                     {
5209                       /* In SLP we don't need to apply reduction operation, so
5210                          we just collect s' values in SCALAR_RESULTS.  */
5211                       new_temp = new_name;
5212                       scalar_results.safe_push (new_name);
5213                     }
5214                   else
5215                     {
5216                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5217                                                          new_name, new_temp);
5218                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5219                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5220                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5221                     }
5222                 }
5223             }
5224
5225           /* The only case where we need to reduce scalar results in SLP, is
5226              unrolling.  If the size of SCALAR_RESULTS is greater than
5227              GROUP_SIZE, we reduce them combining elements modulo
5228              GROUP_SIZE.  */
5229           if (slp_reduc)
5230             {
5231               tree res, first_res, new_res;
5232               gimple *new_stmt;
5233
5234               /* Reduce multiple scalar results in case of SLP unrolling.  */
5235               for (j = group_size; scalar_results.iterate (j, &res);
5236                    j++)
5237                 {
5238                   first_res = scalar_results[j % group_size];
5239                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5240                                                   first_res, res);
5241                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5242                   gimple_assign_set_lhs (new_stmt, new_res);
5243                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5244                   scalar_results[j % group_size] = new_res;
5245                 }
5246             }
5247           else
5248             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5249             scalar_results.safe_push (new_temp);
5250         }
5251
5252       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5253            == INTEGER_INDUC_COND_REDUCTION)
5254           && !operand_equal_p (initial_def, induc_val, 0))
5255         {
5256           /* Earlier we set the initial value to be a vector if induc_val
5257              values.  Check the result and if it is induc_val then replace
5258              with the original initial value, unless induc_val is
5259              the same as initial_def already.  */
5260           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5261                                   induc_val);
5262
5263           tree tmp = make_ssa_name (new_scalar_dest);
5264           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5265                                              initial_def, new_temp);
5266           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5267           scalar_results[0] = tmp;
5268         }
5269     }
5270
5271 vect_finalize_reduction:
5272
5273   if (double_reduc)
5274     loop = loop->inner;
5275
5276   /* 2.5 Adjust the final result by the initial value of the reduction
5277          variable. (When such adjustment is not needed, then
5278          'adjustment_def' is zero).  For example, if code is PLUS we create:
5279          new_temp = loop_exit_def + adjustment_def  */
5280
5281   if (adjustment_def)
5282     {
5283       gcc_assert (!slp_reduc);
5284       if (nested_in_vect_loop)
5285         {
5286           new_phi = new_phis[0];
5287           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5288           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5289           new_dest = vect_create_destination_var (scalar_dest, vectype);
5290         }
5291       else
5292         {
5293           new_temp = scalar_results[0];
5294           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5295           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5296           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5297         }
5298
5299       epilog_stmt = gimple_build_assign (new_dest, expr);
5300       new_temp = make_ssa_name (new_dest, epilog_stmt);
5301       gimple_assign_set_lhs (epilog_stmt, new_temp);
5302       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5303       if (nested_in_vect_loop)
5304         {
5305           set_vinfo_for_stmt (epilog_stmt,
5306                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5307           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5308                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5309
5310           if (!double_reduc)
5311             scalar_results.quick_push (new_temp);
5312           else
5313             scalar_results[0] = new_temp;
5314         }
5315       else
5316         scalar_results[0] = new_temp;
5317
5318       new_phis[0] = epilog_stmt;
5319     }
5320
5321   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5322           phis with new adjusted scalar results, i.e., replace use <s_out0>
5323           with use <s_out4>.
5324
5325      Transform:
5326         loop_exit:
5327           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5328           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5329           v_out2 = reduce <v_out1>
5330           s_out3 = extract_field <v_out2, 0>
5331           s_out4 = adjust_result <s_out3>
5332           use <s_out0>
5333           use <s_out0>
5334
5335      into:
5336
5337         loop_exit:
5338           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5339           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5340           v_out2 = reduce <v_out1>
5341           s_out3 = extract_field <v_out2, 0>
5342           s_out4 = adjust_result <s_out3>
5343           use <s_out4>
5344           use <s_out4> */
5345
5346
5347   /* In SLP reduction chain we reduce vector results into one vector if
5348      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5349      the last stmt in the reduction chain, since we are looking for the loop
5350      exit phi node.  */
5351   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5352     {
5353       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5354       /* Handle reduction patterns.  */
5355       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5356         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5357
5358       scalar_dest = gimple_assign_lhs (dest_stmt);
5359       group_size = 1;
5360     }
5361
5362   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5363      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5364      need to match SCALAR_RESULTS with corresponding statements.  The first
5365      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5366      the first vector stmt, etc.
5367      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5368   if (group_size > new_phis.length ())
5369     {
5370       ratio = group_size / new_phis.length ();
5371       gcc_assert (!(group_size % new_phis.length ()));
5372     }
5373   else
5374     ratio = 1;
5375
5376   for (k = 0; k < group_size; k++)
5377     {
5378       if (k % ratio == 0)
5379         {
5380           epilog_stmt = new_phis[k / ratio];
5381           reduction_phi = reduction_phis[k / ratio];
5382           if (double_reduc)
5383             inner_phi = inner_phis[k / ratio];
5384         }
5385
5386       if (slp_reduc)
5387         {
5388           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5389
5390           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5391           /* SLP statements can't participate in patterns.  */
5392           gcc_assert (!orig_stmt);
5393           scalar_dest = gimple_assign_lhs (current_stmt);
5394         }
5395
5396       phis.create (3);
5397       /* Find the loop-closed-use at the loop exit of the original scalar
5398          result.  (The reduction result is expected to have two immediate uses -
5399          one at the latch block, and one at the loop exit).  */
5400       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5401         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5402             && !is_gimple_debug (USE_STMT (use_p)))
5403           phis.safe_push (USE_STMT (use_p));
5404
5405       /* While we expect to have found an exit_phi because of loop-closed-ssa
5406          form we can end up without one if the scalar cycle is dead.  */
5407
5408       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5409         {
5410           if (outer_loop)
5411             {
5412               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5413               gphi *vect_phi;
5414
5415               /* FORNOW. Currently not supporting the case that an inner-loop
5416                  reduction is not used in the outer-loop (but only outside the
5417                  outer-loop), unless it is double reduction.  */
5418               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5419                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5420                           || double_reduc);
5421
5422               if (double_reduc)
5423                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5424               else
5425                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5426               if (!double_reduc
5427                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5428                       != vect_double_reduction_def)
5429                 continue;
5430
5431               /* Handle double reduction:
5432
5433                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5434                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5435                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5436                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5437
5438                  At that point the regular reduction (stmt2 and stmt3) is
5439                  already vectorized, as well as the exit phi node, stmt4.
5440                  Here we vectorize the phi node of double reduction, stmt1, and
5441                  update all relevant statements.  */
5442
5443               /* Go through all the uses of s2 to find double reduction phi
5444                  node, i.e., stmt1 above.  */
5445               orig_name = PHI_RESULT (exit_phi);
5446               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5447                 {
5448                   stmt_vec_info use_stmt_vinfo;
5449                   stmt_vec_info new_phi_vinfo;
5450                   tree vect_phi_init, preheader_arg, vect_phi_res;
5451                   basic_block bb = gimple_bb (use_stmt);
5452                   gimple *use;
5453
5454                   /* Check that USE_STMT is really double reduction phi
5455                      node.  */
5456                   if (gimple_code (use_stmt) != GIMPLE_PHI
5457                       || gimple_phi_num_args (use_stmt) != 2
5458                       || bb->loop_father != outer_loop)
5459                     continue;
5460                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5461                   if (!use_stmt_vinfo
5462                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5463                           != vect_double_reduction_def)
5464                     continue;
5465
5466                   /* Create vector phi node for double reduction:
5467                      vs1 = phi <vs0, vs2>
5468                      vs1 was created previously in this function by a call to
5469                        vect_get_vec_def_for_operand and is stored in
5470                        vec_initial_def;
5471                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5472                      vs0 is created here.  */
5473
5474                   /* Create vector phi node.  */
5475                   vect_phi = create_phi_node (vec_initial_def, bb);
5476                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5477                                     loop_vec_info_for_loop (outer_loop));
5478                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5479
5480                   /* Create vs0 - initial def of the double reduction phi.  */
5481                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5482                                              loop_preheader_edge (outer_loop));
5483                   vect_phi_init = get_initial_def_for_reduction
5484                     (stmt, preheader_arg, NULL);
5485
5486                   /* Update phi node arguments with vs0 and vs2.  */
5487                   add_phi_arg (vect_phi, vect_phi_init,
5488                                loop_preheader_edge (outer_loop),
5489                                UNKNOWN_LOCATION);
5490                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5491                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5492                   if (dump_enabled_p ())
5493                     {
5494                       dump_printf_loc (MSG_NOTE, vect_location,
5495                                        "created double reduction phi node: ");
5496                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5497                     }
5498
5499                   vect_phi_res = PHI_RESULT (vect_phi);
5500
5501                   /* Replace the use, i.e., set the correct vs1 in the regular
5502                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5503                      loop is redundant.  */
5504                   use = reduction_phi;
5505                   for (j = 0; j < ncopies; j++)
5506                     {
5507                       edge pr_edge = loop_preheader_edge (loop);
5508                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5509                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5510                     }
5511                 }
5512             }
5513         }
5514
5515       phis.release ();
5516       if (nested_in_vect_loop)
5517         {
5518           if (double_reduc)
5519             loop = outer_loop;
5520           else
5521             continue;
5522         }
5523
5524       phis.create (3);
5525       /* Find the loop-closed-use at the loop exit of the original scalar
5526          result.  (The reduction result is expected to have two immediate uses,
5527          one at the latch block, and one at the loop exit).  For double
5528          reductions we are looking for exit phis of the outer loop.  */
5529       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5530         {
5531           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5532             {
5533               if (!is_gimple_debug (USE_STMT (use_p)))
5534                 phis.safe_push (USE_STMT (use_p));
5535             }
5536           else
5537             {
5538               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5539                 {
5540                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5541
5542                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5543                     {
5544                       if (!flow_bb_inside_loop_p (loop,
5545                                              gimple_bb (USE_STMT (phi_use_p)))
5546                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5547                         phis.safe_push (USE_STMT (phi_use_p));
5548                     }
5549                 }
5550             }
5551         }
5552
5553       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5554         {
5555           /* Replace the uses:  */
5556           orig_name = PHI_RESULT (exit_phi);
5557           scalar_result = scalar_results[k];
5558           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5559             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5560               SET_USE (use_p, scalar_result);
5561         }
5562
5563       phis.release ();
5564     }
5565 }
5566
5567
5568 /* Function is_nonwrapping_integer_induction.
5569
5570    Check if STMT (which is part of loop LOOP) both increments and
5571    does not cause overflow.  */
5572
5573 static bool
5574 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5575 {
5576   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5577   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5578   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5579   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5580   widest_int ni, max_loop_value, lhs_max;
5581   bool overflow = false;
5582
5583   /* Make sure the loop is integer based.  */
5584   if (TREE_CODE (base) != INTEGER_CST
5585       || TREE_CODE (step) != INTEGER_CST)
5586     return false;
5587
5588   /* Check that the max size of the loop will not wrap.  */
5589
5590   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5591     return true;
5592
5593   if (! max_stmt_executions (loop, &ni))
5594     return false;
5595
5596   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5597                             &overflow);
5598   if (overflow)
5599     return false;
5600
5601   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5602                             TYPE_SIGN (lhs_type), &overflow);
5603   if (overflow)
5604     return false;
5605
5606   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5607           <= TYPE_PRECISION (lhs_type));
5608 }
5609
5610 /* Function vectorizable_reduction.
5611
5612    Check if STMT performs a reduction operation that can be vectorized.
5613    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5614    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5615    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5616
5617    This function also handles reduction idioms (patterns) that have been
5618    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5619    of this form:
5620      X = pattern_expr (arg0, arg1, ..., X)
5621    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5622    sequence that had been detected and replaced by the pattern-stmt (STMT).
5623
5624    This function also handles reduction of condition expressions, for example:
5625      for (int i = 0; i < N; i++)
5626        if (a[i] < value)
5627          last = a[i];
5628    This is handled by vectorising the loop and creating an additional vector
5629    containing the loop indexes for which "a[i] < value" was true.  In the
5630    function epilogue this is reduced to a single max value and then used to
5631    index into the vector of results.
5632
5633    In some cases of reduction patterns, the type of the reduction variable X is
5634    different than the type of the other arguments of STMT.
5635    In such cases, the vectype that is used when transforming STMT into a vector
5636    stmt is different than the vectype that is used to determine the
5637    vectorization factor, because it consists of a different number of elements
5638    than the actual number of elements that are being operated upon in parallel.
5639
5640    For example, consider an accumulation of shorts into an int accumulator.
5641    On some targets it's possible to vectorize this pattern operating on 8
5642    shorts at a time (hence, the vectype for purposes of determining the
5643    vectorization factor should be V8HI); on the other hand, the vectype that
5644    is used to create the vector form is actually V4SI (the type of the result).
5645
5646    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5647    indicates what is the actual level of parallelism (V8HI in the example), so
5648    that the right vectorization factor would be derived.  This vectype
5649    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5650    be used to create the vectorized stmt.  The right vectype for the vectorized
5651    stmt is obtained from the type of the result X:
5652         get_vectype_for_scalar_type (TREE_TYPE (X))
5653
5654    This means that, contrary to "regular" reductions (or "regular" stmts in
5655    general), the following equation:
5656       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5657    does *NOT* necessarily hold for reduction patterns.  */
5658
5659 bool
5660 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5661                         gimple **vec_stmt, slp_tree slp_node,
5662                         slp_instance slp_node_instance)
5663 {
5664   tree vec_dest;
5665   tree scalar_dest;
5666   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5667   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5668   tree vectype_in = NULL_TREE;
5669   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5670   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5671   enum tree_code code, orig_code;
5672   internal_fn reduc_fn;
5673   machine_mode vec_mode;
5674   int op_type;
5675   optab optab;
5676   tree new_temp = NULL_TREE;
5677   gimple *def_stmt;
5678   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5679   gimple *cond_reduc_def_stmt = NULL;
5680   enum tree_code cond_reduc_op_code = ERROR_MARK;
5681   tree scalar_type;
5682   bool is_simple_use;
5683   gimple *orig_stmt;
5684   stmt_vec_info orig_stmt_info = NULL;
5685   int i;
5686   int ncopies;
5687   int epilog_copies;
5688   stmt_vec_info prev_stmt_info, prev_phi_info;
5689   bool single_defuse_cycle = false;
5690   gimple *new_stmt = NULL;
5691   int j;
5692   tree ops[3];
5693   enum vect_def_type dts[3];
5694   bool nested_cycle = false, found_nested_cycle_def = false;
5695   bool double_reduc = false;
5696   basic_block def_bb;
5697   struct loop * def_stmt_loop, *outer_loop = NULL;
5698   tree def_arg;
5699   gimple *def_arg_stmt;
5700   auto_vec<tree> vec_oprnds0;
5701   auto_vec<tree> vec_oprnds1;
5702   auto_vec<tree> vec_oprnds2;
5703   auto_vec<tree> vect_defs;
5704   auto_vec<gimple *> phis;
5705   int vec_num;
5706   tree def0, tem;
5707   bool first_p = true;
5708   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5709   tree cond_reduc_val = NULL_TREE;
5710
5711   /* Make sure it was already recognized as a reduction computation.  */
5712   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5713       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5714     return false;
5715
5716   if (nested_in_vect_loop_p (loop, stmt))
5717     {
5718       outer_loop = loop;
5719       loop = loop->inner;
5720       nested_cycle = true;
5721     }
5722
5723   /* In case of reduction chain we switch to the first stmt in the chain, but
5724      we don't update STMT_INFO, since only the last stmt is marked as reduction
5725      and has reduction properties.  */
5726   if (GROUP_FIRST_ELEMENT (stmt_info)
5727       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5728     {
5729       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5730       first_p = false;
5731     }
5732
5733   if (gimple_code (stmt) == GIMPLE_PHI)
5734     {
5735       /* Analysis is fully done on the reduction stmt invocation.  */
5736       if (! vec_stmt)
5737         {
5738           if (slp_node)
5739             slp_node_instance->reduc_phis = slp_node;
5740
5741           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5742           return true;
5743         }
5744
5745       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5746       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5747         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5748
5749       gcc_assert (is_gimple_assign (reduc_stmt));
5750       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5751         {
5752           tree op = gimple_op (reduc_stmt, k);
5753           if (op == gimple_phi_result (stmt))
5754             continue;
5755           if (k == 1
5756               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5757             continue;
5758           if (!vectype_in
5759               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
5760                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
5761             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
5762           break;
5763         }
5764       gcc_assert (vectype_in);
5765
5766       if (slp_node)
5767         ncopies = 1;
5768       else
5769         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5770
5771       use_operand_p use_p;
5772       gimple *use_stmt;
5773       if (ncopies > 1
5774           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5775               <= vect_used_only_live)
5776           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5777           && (use_stmt == reduc_stmt
5778               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5779                   == reduc_stmt)))
5780         single_defuse_cycle = true;
5781
5782       /* Create the destination vector  */
5783       scalar_dest = gimple_assign_lhs (reduc_stmt);
5784       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5785
5786       if (slp_node)
5787         /* The size vect_schedule_slp_instance computes is off for us.  */
5788         vec_num = vect_get_num_vectors
5789           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5790            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
5791            vectype_in);
5792       else
5793         vec_num = 1;
5794
5795       /* Generate the reduction PHIs upfront.  */
5796       prev_phi_info = NULL;
5797       for (j = 0; j < ncopies; j++)
5798         {
5799           if (j == 0 || !single_defuse_cycle)
5800             {
5801               for (i = 0; i < vec_num; i++)
5802                 {
5803                   /* Create the reduction-phi that defines the reduction
5804                      operand.  */
5805                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
5806                   set_vinfo_for_stmt (new_phi,
5807                                       new_stmt_vec_info (new_phi, loop_vinfo));
5808
5809                   if (slp_node)
5810                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5811                   else
5812                     {
5813                       if (j == 0)
5814                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5815                       else
5816                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5817                       prev_phi_info = vinfo_for_stmt (new_phi);
5818                     }
5819                 }
5820             }
5821         }
5822
5823       return true;
5824     }
5825
5826   /* 1. Is vectorizable reduction?  */
5827   /* Not supportable if the reduction variable is used in the loop, unless
5828      it's a reduction chain.  */
5829   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5830       && !GROUP_FIRST_ELEMENT (stmt_info))
5831     return false;
5832
5833   /* Reductions that are not used even in an enclosing outer-loop,
5834      are expected to be "live" (used out of the loop).  */
5835   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5836       && !STMT_VINFO_LIVE_P (stmt_info))
5837     return false;
5838
5839   /* 2. Has this been recognized as a reduction pattern?
5840
5841      Check if STMT represents a pattern that has been recognized
5842      in earlier analysis stages.  For stmts that represent a pattern,
5843      the STMT_VINFO_RELATED_STMT field records the last stmt in
5844      the original sequence that constitutes the pattern.  */
5845
5846   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5847   if (orig_stmt)
5848     {
5849       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5850       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5851       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5852     }
5853
5854   /* 3. Check the operands of the operation.  The first operands are defined
5855         inside the loop body. The last operand is the reduction variable,
5856         which is defined by the loop-header-phi.  */
5857
5858   gcc_assert (is_gimple_assign (stmt));
5859
5860   /* Flatten RHS.  */
5861   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5862     {
5863     case GIMPLE_BINARY_RHS:
5864       code = gimple_assign_rhs_code (stmt);
5865       op_type = TREE_CODE_LENGTH (code);
5866       gcc_assert (op_type == binary_op);
5867       ops[0] = gimple_assign_rhs1 (stmt);
5868       ops[1] = gimple_assign_rhs2 (stmt);
5869       break;
5870
5871     case GIMPLE_TERNARY_RHS:
5872       code = gimple_assign_rhs_code (stmt);
5873       op_type = TREE_CODE_LENGTH (code);
5874       gcc_assert (op_type == ternary_op);
5875       ops[0] = gimple_assign_rhs1 (stmt);
5876       ops[1] = gimple_assign_rhs2 (stmt);
5877       ops[2] = gimple_assign_rhs3 (stmt);
5878       break;
5879
5880     case GIMPLE_UNARY_RHS:
5881       return false;
5882
5883     default:
5884       gcc_unreachable ();
5885     }
5886
5887   if (code == COND_EXPR && slp_node)
5888     return false;
5889
5890   scalar_dest = gimple_assign_lhs (stmt);
5891   scalar_type = TREE_TYPE (scalar_dest);
5892   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5893       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5894     return false;
5895
5896   /* Do not try to vectorize bit-precision reductions.  */
5897   if (!type_has_mode_precision_p (scalar_type))
5898     return false;
5899
5900   /* All uses but the last are expected to be defined in the loop.
5901      The last use is the reduction variable.  In case of nested cycle this
5902      assumption is not true: we use reduc_index to record the index of the
5903      reduction variable.  */
5904   gimple *reduc_def_stmt = NULL;
5905   int reduc_index = -1;
5906   for (i = 0; i < op_type; i++)
5907     {
5908       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5909       if (i == 0 && code == COND_EXPR)
5910         continue;
5911
5912       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5913                                           &def_stmt, &dts[i], &tem);
5914       dt = dts[i];
5915       gcc_assert (is_simple_use);
5916       if (dt == vect_reduction_def)
5917         {
5918           reduc_def_stmt = def_stmt;
5919           reduc_index = i;
5920           continue;
5921         }
5922       else if (tem)
5923         {
5924           /* To properly compute ncopies we are interested in the widest
5925              input type in case we're looking at a widening accumulation.  */
5926           if (!vectype_in
5927               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
5928                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
5929             vectype_in = tem;
5930         }
5931
5932       if (dt != vect_internal_def
5933           && dt != vect_external_def
5934           && dt != vect_constant_def
5935           && dt != vect_induction_def
5936           && !(dt == vect_nested_cycle && nested_cycle))
5937         return false;
5938
5939       if (dt == vect_nested_cycle)
5940         {
5941           found_nested_cycle_def = true;
5942           reduc_def_stmt = def_stmt;
5943           reduc_index = i;
5944         }
5945
5946       if (i == 1 && code == COND_EXPR)
5947         {
5948           /* Record how value of COND_EXPR is defined.  */
5949           if (dt == vect_constant_def)
5950             {
5951               cond_reduc_dt = dt;
5952               cond_reduc_val = ops[i];
5953             }
5954           if (dt == vect_induction_def
5955               && def_stmt != NULL
5956               && is_nonwrapping_integer_induction (def_stmt, loop))
5957             {
5958               cond_reduc_dt = dt;
5959               cond_reduc_def_stmt = def_stmt;
5960             }
5961         }
5962     }
5963
5964   if (!vectype_in)
5965     vectype_in = vectype_out;
5966
5967   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5968      directy used in stmt.  */
5969   if (reduc_index == -1)
5970     {
5971       if (orig_stmt)
5972         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5973       else
5974         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5975     }
5976
5977   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5978     return false;
5979
5980   if (!(reduc_index == -1
5981         || dts[reduc_index] == vect_reduction_def
5982         || dts[reduc_index] == vect_nested_cycle
5983         || ((dts[reduc_index] == vect_internal_def
5984              || dts[reduc_index] == vect_external_def
5985              || dts[reduc_index] == vect_constant_def
5986              || dts[reduc_index] == vect_induction_def)
5987             && nested_cycle && found_nested_cycle_def)))
5988     {
5989       /* For pattern recognized stmts, orig_stmt might be a reduction,
5990          but some helper statements for the pattern might not, or
5991          might be COND_EXPRs with reduction uses in the condition.  */
5992       gcc_assert (orig_stmt);
5993       return false;
5994     }
5995
5996   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5997   enum vect_reduction_type v_reduc_type
5998     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5999   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6000
6001   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6002   /* If we have a condition reduction, see if we can simplify it further.  */
6003   if (v_reduc_type == COND_REDUCTION)
6004     {
6005       if (cond_reduc_dt == vect_induction_def)
6006         {
6007           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6008           tree base
6009             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6010           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6011
6012           gcc_assert (TREE_CODE (base) == INTEGER_CST
6013                       && TREE_CODE (step) == INTEGER_CST);
6014           cond_reduc_val = NULL_TREE;
6015           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6016              above base; punt if base is the minimum value of the type for
6017              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6018           if (tree_int_cst_sgn (step) == -1)
6019             {
6020               cond_reduc_op_code = MIN_EXPR;
6021               if (tree_int_cst_sgn (base) == -1)
6022                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6023               else if (tree_int_cst_lt (base,
6024                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6025                 cond_reduc_val
6026                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6027             }
6028           else
6029             {
6030               cond_reduc_op_code = MAX_EXPR;
6031               if (tree_int_cst_sgn (base) == 1)
6032                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6033               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6034                                         base))
6035                 cond_reduc_val
6036                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6037             }
6038           if (cond_reduc_val)
6039             {
6040               if (dump_enabled_p ())
6041                 dump_printf_loc (MSG_NOTE, vect_location,
6042                                  "condition expression based on "
6043                                  "integer induction.\n");
6044               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6045                 = INTEGER_INDUC_COND_REDUCTION;
6046             }
6047         }
6048
6049       /* Loop peeling modifies initial value of reduction PHI, which
6050          makes the reduction stmt to be transformed different to the
6051          original stmt analyzed.  We need to record reduction code for
6052          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6053          it can be used directly at transform stage.  */
6054       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6055           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6056         {
6057           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6058           gcc_assert (cond_reduc_dt == vect_constant_def);
6059           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6060         }
6061       else if (cond_reduc_dt == vect_constant_def)
6062         {
6063           enum vect_def_type cond_initial_dt;
6064           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6065           tree cond_initial_val
6066             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6067
6068           gcc_assert (cond_reduc_val != NULL_TREE);
6069           vect_is_simple_use (cond_initial_val, loop_vinfo,
6070                               &def_stmt, &cond_initial_dt);
6071           if (cond_initial_dt == vect_constant_def
6072               && types_compatible_p (TREE_TYPE (cond_initial_val),
6073                                      TREE_TYPE (cond_reduc_val)))
6074             {
6075               tree e = fold_binary (LE_EXPR, boolean_type_node,
6076                                     cond_initial_val, cond_reduc_val);
6077               if (e && (integer_onep (e) || integer_zerop (e)))
6078                 {
6079                   if (dump_enabled_p ())
6080                     dump_printf_loc (MSG_NOTE, vect_location,
6081                                      "condition expression based on "
6082                                      "compile time constant.\n");
6083                   /* Record reduction code at analysis stage.  */
6084                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6085                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6086                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6087                     = CONST_COND_REDUCTION;
6088                 }
6089             }
6090         }
6091     }
6092
6093   if (orig_stmt)
6094     gcc_assert (tmp == orig_stmt
6095                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6096   else
6097     /* We changed STMT to be the first stmt in reduction chain, hence we
6098        check that in this case the first element in the chain is STMT.  */
6099     gcc_assert (stmt == tmp
6100                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6101
6102   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6103     return false;
6104
6105   if (slp_node)
6106     ncopies = 1;
6107   else
6108     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6109
6110   gcc_assert (ncopies >= 1);
6111
6112   vec_mode = TYPE_MODE (vectype_in);
6113   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6114
6115   if (code == COND_EXPR)
6116     {
6117       /* Only call during the analysis stage, otherwise we'll lose
6118          STMT_VINFO_TYPE.  */
6119       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6120                                                 ops[reduc_index], 0, NULL))
6121         {
6122           if (dump_enabled_p ())
6123             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6124                              "unsupported condition in reduction\n");
6125           return false;
6126         }
6127     }
6128   else
6129     {
6130       /* 4. Supportable by target?  */
6131
6132       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6133           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6134         {
6135           /* Shifts and rotates are only supported by vectorizable_shifts,
6136              not vectorizable_reduction.  */
6137           if (dump_enabled_p ())
6138             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6139                              "unsupported shift or rotation.\n");
6140           return false;
6141         }
6142
6143       /* 4.1. check support for the operation in the loop  */
6144       optab = optab_for_tree_code (code, vectype_in, optab_default);
6145       if (!optab)
6146         {
6147           if (dump_enabled_p ())
6148             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6149                              "no optab.\n");
6150
6151           return false;
6152         }
6153
6154       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6155         {
6156           if (dump_enabled_p ())
6157             dump_printf (MSG_NOTE, "op not supported by target.\n");
6158
6159           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6160               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6161             return false;
6162
6163           if (dump_enabled_p ())
6164             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6165         }
6166
6167       /* Worthwhile without SIMD support?  */
6168       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6169           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6170         {
6171           if (dump_enabled_p ())
6172             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6173                              "not worthwhile without SIMD support.\n");
6174
6175           return false;
6176         }
6177     }
6178
6179   /* 4.2. Check support for the epilog operation.
6180
6181           If STMT represents a reduction pattern, then the type of the
6182           reduction variable may be different than the type of the rest
6183           of the arguments.  For example, consider the case of accumulation
6184           of shorts into an int accumulator; The original code:
6185                         S1: int_a = (int) short_a;
6186           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6187
6188           was replaced with:
6189                         STMT: int_acc = widen_sum <short_a, int_acc>
6190
6191           This means that:
6192           1. The tree-code that is used to create the vector operation in the
6193              epilog code (that reduces the partial results) is not the
6194              tree-code of STMT, but is rather the tree-code of the original
6195              stmt from the pattern that STMT is replacing.  I.e, in the example
6196              above we want to use 'widen_sum' in the loop, but 'plus' in the
6197              epilog.
6198           2. The type (mode) we use to check available target support
6199              for the vector operation to be created in the *epilog*, is
6200              determined by the type of the reduction variable (in the example
6201              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6202              However the type (mode) we use to check available target support
6203              for the vector operation to be created *inside the loop*, is
6204              determined by the type of the other arguments to STMT (in the
6205              example we'd check this: optab_handler (widen_sum_optab,
6206              vect_short_mode)).
6207
6208           This is contrary to "regular" reductions, in which the types of all
6209           the arguments are the same as the type of the reduction variable.
6210           For "regular" reductions we can therefore use the same vector type
6211           (and also the same tree-code) when generating the epilog code and
6212           when generating the code inside the loop.  */
6213
6214   if (orig_stmt)
6215     {
6216       /* This is a reduction pattern: get the vectype from the type of the
6217          reduction variable, and get the tree-code from orig_stmt.  */
6218       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6219                   == TREE_CODE_REDUCTION);
6220       orig_code = gimple_assign_rhs_code (orig_stmt);
6221       gcc_assert (vectype_out);
6222       vec_mode = TYPE_MODE (vectype_out);
6223     }
6224   else
6225     {
6226       /* Regular reduction: use the same vectype and tree-code as used for
6227          the vector code inside the loop can be used for the epilog code. */
6228       orig_code = code;
6229
6230       if (code == MINUS_EXPR)
6231         orig_code = PLUS_EXPR;
6232
6233       /* For simple condition reductions, replace with the actual expression
6234          we want to base our reduction around.  */
6235       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6236         {
6237           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6238           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6239         }
6240       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6241                == INTEGER_INDUC_COND_REDUCTION)
6242         orig_code = cond_reduc_op_code;
6243     }
6244
6245   if (nested_cycle)
6246     {
6247       def_bb = gimple_bb (reduc_def_stmt);
6248       def_stmt_loop = def_bb->loop_father;
6249       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6250                                        loop_preheader_edge (def_stmt_loop));
6251       if (TREE_CODE (def_arg) == SSA_NAME
6252           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6253           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6254           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6255           && vinfo_for_stmt (def_arg_stmt)
6256           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6257               == vect_double_reduction_def)
6258         double_reduc = true;
6259     }
6260
6261   reduc_fn = IFN_LAST;
6262
6263   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6264     {
6265       if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6266         {
6267           if (reduc_fn != IFN_LAST
6268               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6269                                                   OPTIMIZE_FOR_SPEED))
6270             {
6271               if (dump_enabled_p ())
6272                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6273                                  "reduc op not supported by target.\n");
6274
6275               reduc_fn = IFN_LAST;
6276             }
6277         }
6278       else
6279         {
6280           if (!nested_cycle || double_reduc)
6281             {
6282               if (dump_enabled_p ())
6283                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6284                                  "no reduc code for scalar code.\n");
6285
6286               return false;
6287             }
6288         }
6289     }
6290   else
6291     {
6292       int scalar_precision
6293         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6294       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6295       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6296                                                 nunits_out);
6297
6298       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6299                                           OPTIMIZE_FOR_SPEED))
6300         reduc_fn = IFN_REDUC_MAX;
6301     }
6302
6303   if (reduc_fn == IFN_LAST && !nunits_out.is_constant ())
6304     {
6305       if (dump_enabled_p ())
6306         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6307                          "missing target support for reduction on"
6308                          " variable-length vectors.\n");
6309       return false;
6310     }
6311
6312   if ((double_reduc
6313        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6314       && ncopies > 1)
6315     {
6316       if (dump_enabled_p ())
6317         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6318                          "multiple types in double reduction or condition "
6319                          "reduction.\n");
6320       return false;
6321     }
6322
6323   if (double_reduc && !nunits_out.is_constant ())
6324     {
6325       /* The current double-reduction code creates the initial value
6326          element-by-element.  */
6327       if (dump_enabled_p ())
6328         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6329                          "double reduction not supported for variable-length"
6330                          " vectors.\n");
6331       return false;
6332     }
6333
6334   if (slp_node && !nunits_out.is_constant ())
6335     {
6336       /* The current SLP code creates the initial value element-by-element.  */
6337       if (dump_enabled_p ())
6338         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6339                          "SLP reduction not supported for variable-length"
6340                          " vectors.\n");
6341       return false;
6342     }
6343
6344   /* In case of widenning multiplication by a constant, we update the type
6345      of the constant to be the type of the other operand.  We check that the
6346      constant fits the type in the pattern recognition pass.  */
6347   if (code == DOT_PROD_EXPR
6348       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6349     {
6350       if (TREE_CODE (ops[0]) == INTEGER_CST)
6351         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6352       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6353         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6354       else
6355         {
6356           if (dump_enabled_p ())
6357             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6358                              "invalid types in dot-prod\n");
6359
6360           return false;
6361         }
6362     }
6363
6364   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6365     {
6366       widest_int ni;
6367
6368       if (! max_loop_iterations (loop, &ni))
6369         {
6370           if (dump_enabled_p ())
6371             dump_printf_loc (MSG_NOTE, vect_location,
6372                              "loop count not known, cannot create cond "
6373                              "reduction.\n");
6374           return false;
6375         }
6376       /* Convert backedges to iterations.  */
6377       ni += 1;
6378
6379       /* The additional index will be the same type as the condition.  Check
6380          that the loop can fit into this less one (because we'll use up the
6381          zero slot for when there are no matches).  */
6382       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6383       if (wi::geu_p (ni, wi::to_widest (max_index)))
6384         {
6385           if (dump_enabled_p ())
6386             dump_printf_loc (MSG_NOTE, vect_location,
6387                              "loop size is greater than data size.\n");
6388           return false;
6389         }
6390     }
6391
6392   /* In case the vectorization factor (VF) is bigger than the number
6393      of elements that we can fit in a vectype (nunits), we have to generate
6394      more than one vector stmt - i.e - we need to "unroll" the
6395      vector stmt by a factor VF/nunits.  For more details see documentation
6396      in vectorizable_operation.  */
6397
6398   /* If the reduction is used in an outer loop we need to generate
6399      VF intermediate results, like so (e.g. for ncopies=2):
6400         r0 = phi (init, r0)
6401         r1 = phi (init, r1)
6402         r0 = x0 + r0;
6403         r1 = x1 + r1;
6404     (i.e. we generate VF results in 2 registers).
6405     In this case we have a separate def-use cycle for each copy, and therefore
6406     for each copy we get the vector def for the reduction variable from the
6407     respective phi node created for this copy.
6408
6409     Otherwise (the reduction is unused in the loop nest), we can combine
6410     together intermediate results, like so (e.g. for ncopies=2):
6411         r = phi (init, r)
6412         r = x0 + r;
6413         r = x1 + r;
6414    (i.e. we generate VF/2 results in a single register).
6415    In this case for each copy we get the vector def for the reduction variable
6416    from the vectorized reduction operation generated in the previous iteration.
6417
6418    This only works when we see both the reduction PHI and its only consumer
6419    in vectorizable_reduction and there are no intermediate stmts
6420    participating.  */
6421   use_operand_p use_p;
6422   gimple *use_stmt;
6423   if (ncopies > 1
6424       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6425       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6426       && (use_stmt == stmt
6427           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6428     {
6429       single_defuse_cycle = true;
6430       epilog_copies = 1;
6431     }
6432   else
6433     epilog_copies = ncopies;
6434
6435   /* If the reduction stmt is one of the patterns that have lane
6436      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6437   if ((ncopies > 1
6438        && ! single_defuse_cycle)
6439       && (code == DOT_PROD_EXPR
6440           || code == WIDEN_SUM_EXPR
6441           || code == SAD_EXPR))
6442     {
6443       if (dump_enabled_p ())
6444         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6445                          "multi def-use cycle not possible for lane-reducing "
6446                          "reduction operation\n");
6447       return false;
6448     }
6449
6450   if (!vec_stmt) /* transformation not required.  */
6451     {
6452       if (first_p)
6453         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
6454       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6455       return true;
6456     }
6457
6458   /* Transform.  */
6459
6460   if (dump_enabled_p ())
6461     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6462
6463   /* FORNOW: Multiple types are not supported for condition.  */
6464   if (code == COND_EXPR)
6465     gcc_assert (ncopies == 1);
6466
6467   /* Create the destination vector  */
6468   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6469
6470   prev_stmt_info = NULL;
6471   prev_phi_info = NULL;
6472   if (slp_node)
6473     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6474   else
6475     {
6476       vec_num = 1;
6477       vec_oprnds0.create (1);
6478       vec_oprnds1.create (1);
6479       if (op_type == ternary_op)
6480         vec_oprnds2.create (1);
6481     }
6482
6483   phis.create (vec_num);
6484   vect_defs.create (vec_num);
6485   if (!slp_node)
6486     vect_defs.quick_push (NULL_TREE);
6487
6488   if (slp_node)
6489     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6490   else
6491     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6492
6493   for (j = 0; j < ncopies; j++)
6494     {
6495       if (code == COND_EXPR)
6496         {
6497           gcc_assert (!slp_node);
6498           vectorizable_condition (stmt, gsi, vec_stmt,
6499                                   PHI_RESULT (phis[0]),
6500                                   reduc_index, NULL);
6501           /* Multiple types are not supported for condition.  */
6502           break;
6503         }
6504
6505       /* Handle uses.  */
6506       if (j == 0)
6507         {
6508           if (slp_node)
6509             {
6510               /* Get vec defs for all the operands except the reduction index,
6511                  ensuring the ordering of the ops in the vector is kept.  */
6512               auto_vec<tree, 3> slp_ops;
6513               auto_vec<vec<tree>, 3> vec_defs;
6514
6515               slp_ops.quick_push (ops[0]);
6516               slp_ops.quick_push (ops[1]);
6517               if (op_type == ternary_op)
6518                 slp_ops.quick_push (ops[2]);
6519
6520               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6521
6522               vec_oprnds0.safe_splice (vec_defs[0]);
6523               vec_defs[0].release ();
6524               vec_oprnds1.safe_splice (vec_defs[1]);
6525               vec_defs[1].release ();
6526               if (op_type == ternary_op)
6527                 {
6528                   vec_oprnds2.safe_splice (vec_defs[2]);
6529                   vec_defs[2].release ();
6530                 }
6531             }
6532           else
6533             {
6534               vec_oprnds0.quick_push
6535                 (vect_get_vec_def_for_operand (ops[0], stmt));
6536               vec_oprnds1.quick_push
6537                 (vect_get_vec_def_for_operand (ops[1], stmt));
6538               if (op_type == ternary_op)
6539                 vec_oprnds2.quick_push
6540                   (vect_get_vec_def_for_operand (ops[2], stmt));
6541             }
6542         }
6543       else
6544         {
6545           if (!slp_node)
6546             {
6547               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6548
6549               if (single_defuse_cycle && reduc_index == 0)
6550                 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6551               else
6552                 vec_oprnds0[0]
6553                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6554               if (single_defuse_cycle && reduc_index == 1)
6555                 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6556               else
6557                 vec_oprnds1[0]
6558                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6559               if (op_type == ternary_op)
6560                 {
6561                   if (single_defuse_cycle && reduc_index == 2)
6562                     vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6563                   else
6564                     vec_oprnds2[0]
6565                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6566                 }
6567             }
6568         }
6569
6570       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6571         {
6572           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6573           if (op_type == ternary_op)
6574             vop[2] = vec_oprnds2[i];
6575
6576           new_temp = make_ssa_name (vec_dest, new_stmt);
6577           new_stmt = gimple_build_assign (new_temp, code,
6578                                           vop[0], vop[1], vop[2]);
6579           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6580
6581           if (slp_node)
6582             {
6583               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6584               vect_defs.quick_push (new_temp);
6585             }
6586           else
6587             vect_defs[0] = new_temp;
6588         }
6589
6590       if (slp_node)
6591         continue;
6592
6593       if (j == 0)
6594         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6595       else
6596         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6597
6598       prev_stmt_info = vinfo_for_stmt (new_stmt);
6599     }
6600
6601   /* Finalize the reduction-phi (set its arguments) and create the
6602      epilog reduction code.  */
6603   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6604     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6605
6606   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6607                                     epilog_copies, reduc_fn, phis,
6608                                     double_reduc, slp_node, slp_node_instance,
6609                                     cond_reduc_val, cond_reduc_op_code);
6610
6611   return true;
6612 }
6613
6614 /* Function vect_min_worthwhile_factor.
6615
6616    For a loop where we could vectorize the operation indicated by CODE,
6617    return the minimum vectorization factor that makes it worthwhile
6618    to use generic vectors.  */
6619 static unsigned int
6620 vect_min_worthwhile_factor (enum tree_code code)
6621 {
6622   switch (code)
6623     {
6624     case PLUS_EXPR:
6625     case MINUS_EXPR:
6626     case NEGATE_EXPR:
6627       return 4;
6628
6629     case BIT_AND_EXPR:
6630     case BIT_IOR_EXPR:
6631     case BIT_XOR_EXPR:
6632     case BIT_NOT_EXPR:
6633       return 2;
6634
6635     default:
6636       return INT_MAX;
6637     }
6638 }
6639
6640 /* Return true if VINFO indicates we are doing loop vectorization and if
6641    it is worth decomposing CODE operations into scalar operations for
6642    that loop's vectorization factor.  */
6643
6644 bool
6645 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6646 {
6647   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6648   unsigned HOST_WIDE_INT value;
6649   return (loop_vinfo
6650           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
6651           && value >= vect_min_worthwhile_factor (code));
6652 }
6653
6654 /* Function vectorizable_induction
6655
6656    Check if PHI performs an induction computation that can be vectorized.
6657    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6658    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6659    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6660
6661 bool
6662 vectorizable_induction (gimple *phi,
6663                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6664                         gimple **vec_stmt, slp_tree slp_node)
6665 {
6666   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6667   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6668   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6669   unsigned ncopies;
6670   bool nested_in_vect_loop = false;
6671   struct loop *iv_loop;
6672   tree vec_def;
6673   edge pe = loop_preheader_edge (loop);
6674   basic_block new_bb;
6675   tree new_vec, vec_init, vec_step, t;
6676   tree new_name;
6677   gimple *new_stmt;
6678   gphi *induction_phi;
6679   tree induc_def, vec_dest;
6680   tree init_expr, step_expr;
6681   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6682   unsigned i;
6683   tree expr;
6684   gimple_seq stmts;
6685   imm_use_iterator imm_iter;
6686   use_operand_p use_p;
6687   gimple *exit_phi;
6688   edge latch_e;
6689   tree loop_arg;
6690   gimple_stmt_iterator si;
6691   basic_block bb = gimple_bb (phi);
6692
6693   if (gimple_code (phi) != GIMPLE_PHI)
6694     return false;
6695
6696   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6697     return false;
6698
6699   /* Make sure it was recognized as induction computation.  */
6700   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6701     return false;
6702
6703   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6704   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6705
6706   if (slp_node)
6707     ncopies = 1;
6708   else
6709     ncopies = vect_get_num_copies (loop_vinfo, vectype);
6710   gcc_assert (ncopies >= 1);
6711
6712   /* FORNOW. These restrictions should be relaxed.  */
6713   if (nested_in_vect_loop_p (loop, phi))
6714     {
6715       imm_use_iterator imm_iter;
6716       use_operand_p use_p;
6717       gimple *exit_phi;
6718       edge latch_e;
6719       tree loop_arg;
6720
6721       if (ncopies > 1)
6722         {
6723           if (dump_enabled_p ())
6724             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6725                              "multiple types in nested loop.\n");
6726           return false;
6727         }
6728
6729       /* FORNOW: outer loop induction with SLP not supported.  */
6730       if (STMT_SLP_TYPE (stmt_info))
6731         return false;
6732
6733       exit_phi = NULL;
6734       latch_e = loop_latch_edge (loop->inner);
6735       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6736       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6737         {
6738           gimple *use_stmt = USE_STMT (use_p);
6739           if (is_gimple_debug (use_stmt))
6740             continue;
6741
6742           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6743             {
6744               exit_phi = use_stmt;
6745               break;
6746             }
6747         }
6748       if (exit_phi)
6749         {
6750           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6751           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6752                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6753             {
6754               if (dump_enabled_p ())
6755                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6756                                  "inner-loop induction only used outside "
6757                                  "of the outer vectorized loop.\n");
6758               return false;
6759             }
6760         }
6761
6762       nested_in_vect_loop = true;
6763       iv_loop = loop->inner;
6764     }
6765   else
6766     iv_loop = loop;
6767   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6768
6769   if (slp_node && !nunits.is_constant ())
6770     {
6771       /* The current SLP code creates the initial value element-by-element.  */
6772       if (dump_enabled_p ())
6773         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6774                          "SLP induction not supported for variable-length"
6775                          " vectors.\n");
6776       return false;
6777     }
6778
6779   if (!vec_stmt) /* transformation not required.  */
6780     {
6781       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6782       if (dump_enabled_p ())
6783         dump_printf_loc (MSG_NOTE, vect_location,
6784                          "=== vectorizable_induction ===\n");
6785       vect_model_induction_cost (stmt_info, ncopies);
6786       return true;
6787     }
6788
6789   /* Transform.  */
6790
6791   /* Compute a vector variable, initialized with the first VF values of
6792      the induction variable.  E.g., for an iv with IV_PHI='X' and
6793      evolution S, for a vector of 4 units, we want to compute:
6794      [X, X + S, X + 2*S, X + 3*S].  */
6795
6796   if (dump_enabled_p ())
6797     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6798
6799   latch_e = loop_latch_edge (iv_loop);
6800   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6801
6802   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6803   gcc_assert (step_expr != NULL_TREE);
6804
6805   pe = loop_preheader_edge (iv_loop);
6806   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6807                                      loop_preheader_edge (iv_loop));
6808
6809   /* Convert the step to the desired type.  */
6810   stmts = NULL;
6811   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6812   if (stmts)
6813     {
6814       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6815       gcc_assert (!new_bb);
6816     }
6817
6818   /* Find the first insertion point in the BB.  */
6819   si = gsi_after_labels (bb);
6820
6821   /* For SLP induction we have to generate several IVs as for example
6822      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6823      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6824      [VF*S, VF*S, VF*S, VF*S] for all.  */
6825   if (slp_node)
6826     {
6827       /* Enforced above.  */
6828       unsigned int const_nunits = nunits.to_constant ();
6829
6830       /* Convert the init to the desired type.  */
6831       stmts = NULL;
6832       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6833       if (stmts)
6834         {
6835           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6836           gcc_assert (!new_bb);
6837         }
6838
6839       /* Generate [VF*S, VF*S, ... ].  */
6840       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6841         {
6842           expr = build_int_cst (integer_type_node, vf);
6843           expr = fold_convert (TREE_TYPE (step_expr), expr);
6844         }
6845       else
6846         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6847       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6848                               expr, step_expr);
6849       if (! CONSTANT_CLASS_P (new_name))
6850         new_name = vect_init_vector (phi, new_name,
6851                                      TREE_TYPE (step_expr), NULL);
6852       new_vec = build_vector_from_val (vectype, new_name);
6853       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6854
6855       /* Now generate the IVs.  */
6856       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6857       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6858       unsigned elts = const_nunits * nvects;
6859       unsigned nivs = least_common_multiple (group_size,
6860                                              const_nunits) / const_nunits;
6861       gcc_assert (elts % group_size == 0);
6862       tree elt = init_expr;
6863       unsigned ivn;
6864       for (ivn = 0; ivn < nivs; ++ivn)
6865         {
6866           tree_vector_builder elts (vectype, const_nunits, 1);
6867           stmts = NULL;
6868           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
6869             {
6870               if (ivn*const_nunits + eltn >= group_size
6871                   && (ivn * const_nunits + eltn) % group_size == 0)
6872                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6873                                     elt, step_expr);
6874               elts.quick_push (elt);
6875             }
6876           vec_init = gimple_build_vector (&stmts, &elts);
6877           if (stmts)
6878             {
6879               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6880               gcc_assert (!new_bb);
6881             }
6882
6883           /* Create the induction-phi that defines the induction-operand.  */
6884           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6885           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6886           set_vinfo_for_stmt (induction_phi,
6887                               new_stmt_vec_info (induction_phi, loop_vinfo));
6888           induc_def = PHI_RESULT (induction_phi);
6889
6890           /* Create the iv update inside the loop  */
6891           vec_def = make_ssa_name (vec_dest);
6892           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6893           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6894           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6895
6896           /* Set the arguments of the phi node:  */
6897           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6898           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6899                        UNKNOWN_LOCATION);
6900
6901           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6902         }
6903
6904       /* Re-use IVs when we can.  */
6905       if (ivn < nvects)
6906         {
6907           unsigned vfp
6908             = least_common_multiple (group_size, const_nunits) / group_size;
6909           /* Generate [VF'*S, VF'*S, ... ].  */
6910           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6911             {
6912               expr = build_int_cst (integer_type_node, vfp);
6913               expr = fold_convert (TREE_TYPE (step_expr), expr);
6914             }
6915           else
6916             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6917           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6918                                   expr, step_expr);
6919           if (! CONSTANT_CLASS_P (new_name))
6920             new_name = vect_init_vector (phi, new_name,
6921                                          TREE_TYPE (step_expr), NULL);
6922           new_vec = build_vector_from_val (vectype, new_name);
6923           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6924           for (; ivn < nvects; ++ivn)
6925             {
6926               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6927               tree def;
6928               if (gimple_code (iv) == GIMPLE_PHI)
6929                 def = gimple_phi_result (iv);
6930               else
6931                 def = gimple_assign_lhs (iv);
6932               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6933                                               PLUS_EXPR,
6934                                               def, vec_step);
6935               if (gimple_code (iv) == GIMPLE_PHI)
6936                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6937               else
6938                 {
6939                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6940                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6941                 }
6942               set_vinfo_for_stmt (new_stmt,
6943                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6944               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6945             }
6946         }
6947
6948       return true;
6949     }
6950
6951   /* Create the vector that holds the initial_value of the induction.  */
6952   if (nested_in_vect_loop)
6953     {
6954       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6955          been created during vectorization of previous stmts.  We obtain it
6956          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6957       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6958       /* If the initial value is not of proper type, convert it.  */
6959       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6960         {
6961           new_stmt
6962             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6963                                                           vect_simple_var,
6964                                                           "vec_iv_"),
6965                                    VIEW_CONVERT_EXPR,
6966                                    build1 (VIEW_CONVERT_EXPR, vectype,
6967                                            vec_init));
6968           vec_init = gimple_assign_lhs (new_stmt);
6969           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6970                                                  new_stmt);
6971           gcc_assert (!new_bb);
6972           set_vinfo_for_stmt (new_stmt,
6973                               new_stmt_vec_info (new_stmt, loop_vinfo));
6974         }
6975     }
6976   else
6977     {
6978       /* iv_loop is the loop to be vectorized. Create:
6979          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6980       stmts = NULL;
6981       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6982
6983       unsigned HOST_WIDE_INT const_nunits;
6984       if (nunits.is_constant (&const_nunits))
6985         {
6986           tree_vector_builder elts (vectype, const_nunits, 1);
6987           elts.quick_push (new_name);
6988           for (i = 1; i < const_nunits; i++)
6989             {
6990               /* Create: new_name_i = new_name + step_expr  */
6991               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6992                                        new_name, step_expr);
6993               elts.quick_push (new_name);
6994             }
6995           /* Create a vector from [new_name_0, new_name_1, ...,
6996              new_name_nunits-1]  */
6997           vec_init = gimple_build_vector (&stmts, &elts);
6998         }
6999       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7000         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7001         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7002                                  new_name, step_expr);
7003       else
7004         {
7005           /* Build:
7006                 [base, base, base, ...]
7007                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7008           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7009           gcc_assert (flag_associative_math);
7010           tree index = build_index_vector (vectype, 0, 1);
7011           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7012                                                         new_name);
7013           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7014                                                         step_expr);
7015           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7016           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7017                                    vec_init, step_vec);
7018           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7019                                    vec_init, base_vec);
7020         }
7021
7022       if (stmts)
7023         {
7024           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7025           gcc_assert (!new_bb);
7026         }
7027     }
7028
7029
7030   /* Create the vector that holds the step of the induction.  */
7031   if (nested_in_vect_loop)
7032     /* iv_loop is nested in the loop to be vectorized. Generate:
7033        vec_step = [S, S, S, S]  */
7034     new_name = step_expr;
7035   else
7036     {
7037       /* iv_loop is the loop to be vectorized. Generate:
7038           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7039       gimple_seq seq = NULL;
7040       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7041         {
7042           expr = build_int_cst (integer_type_node, vf);
7043           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7044         }
7045       else
7046         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7047       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7048                                expr, step_expr);
7049       if (seq)
7050         {
7051           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7052           gcc_assert (!new_bb);
7053         }
7054     }
7055
7056   t = unshare_expr (new_name);
7057   gcc_assert (CONSTANT_CLASS_P (new_name)
7058               || TREE_CODE (new_name) == SSA_NAME);
7059   new_vec = build_vector_from_val (vectype, t);
7060   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7061
7062
7063   /* Create the following def-use cycle:
7064      loop prolog:
7065          vec_init = ...
7066          vec_step = ...
7067      loop:
7068          vec_iv = PHI <vec_init, vec_loop>
7069          ...
7070          STMT
7071          ...
7072          vec_loop = vec_iv + vec_step;  */
7073
7074   /* Create the induction-phi that defines the induction-operand.  */
7075   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7076   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7077   set_vinfo_for_stmt (induction_phi,
7078                       new_stmt_vec_info (induction_phi, loop_vinfo));
7079   induc_def = PHI_RESULT (induction_phi);
7080
7081   /* Create the iv update inside the loop  */
7082   vec_def = make_ssa_name (vec_dest);
7083   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7084   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7085   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7086
7087   /* Set the arguments of the phi node:  */
7088   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7089   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7090                UNKNOWN_LOCATION);
7091
7092   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7093
7094   /* In case that vectorization factor (VF) is bigger than the number
7095      of elements that we can fit in a vectype (nunits), we have to generate
7096      more than one vector stmt - i.e - we need to "unroll" the
7097      vector stmt by a factor VF/nunits.  For more details see documentation
7098      in vectorizable_operation.  */
7099
7100   if (ncopies > 1)
7101     {
7102       gimple_seq seq = NULL;
7103       stmt_vec_info prev_stmt_vinfo;
7104       /* FORNOW. This restriction should be relaxed.  */
7105       gcc_assert (!nested_in_vect_loop);
7106
7107       /* Create the vector that holds the step of the induction.  */
7108       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7109         {
7110           expr = build_int_cst (integer_type_node, nunits);
7111           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7112         }
7113       else
7114         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7115       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7116                                expr, step_expr);
7117       if (seq)
7118         {
7119           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7120           gcc_assert (!new_bb);
7121         }
7122
7123       t = unshare_expr (new_name);
7124       gcc_assert (CONSTANT_CLASS_P (new_name)
7125                   || TREE_CODE (new_name) == SSA_NAME);
7126       new_vec = build_vector_from_val (vectype, t);
7127       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7128
7129       vec_def = induc_def;
7130       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7131       for (i = 1; i < ncopies; i++)
7132         {
7133           /* vec_i = vec_prev + vec_step  */
7134           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7135                                           vec_def, vec_step);
7136           vec_def = make_ssa_name (vec_dest, new_stmt);
7137           gimple_assign_set_lhs (new_stmt, vec_def);
7138
7139           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7140           set_vinfo_for_stmt (new_stmt,
7141                               new_stmt_vec_info (new_stmt, loop_vinfo));
7142           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7143           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7144         }
7145     }
7146
7147   if (nested_in_vect_loop)
7148     {
7149       /* Find the loop-closed exit-phi of the induction, and record
7150          the final vector of induction results:  */
7151       exit_phi = NULL;
7152       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7153         {
7154           gimple *use_stmt = USE_STMT (use_p);
7155           if (is_gimple_debug (use_stmt))
7156             continue;
7157
7158           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7159             {
7160               exit_phi = use_stmt;
7161               break;
7162             }
7163         }
7164       if (exit_phi)
7165         {
7166           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7167           /* FORNOW. Currently not supporting the case that an inner-loop induction
7168              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7169           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7170                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7171
7172           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7173           if (dump_enabled_p ())
7174             {
7175               dump_printf_loc (MSG_NOTE, vect_location,
7176                                "vector of inductions after inner-loop:");
7177               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7178             }
7179         }
7180     }
7181
7182
7183   if (dump_enabled_p ())
7184     {
7185       dump_printf_loc (MSG_NOTE, vect_location,
7186                        "transform induction: created def-use cycle: ");
7187       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7188       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7189                         SSA_NAME_DEF_STMT (vec_def), 0);
7190     }
7191
7192   return true;
7193 }
7194
7195 /* Function vectorizable_live_operation.
7196
7197    STMT computes a value that is used outside the loop.  Check if
7198    it can be supported.  */
7199
7200 bool
7201 vectorizable_live_operation (gimple *stmt,
7202                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7203                              slp_tree slp_node, int slp_index,
7204                              gimple **vec_stmt)
7205 {
7206   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7207   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7208   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7209   imm_use_iterator imm_iter;
7210   tree lhs, lhs_type, bitsize, vec_bitsize;
7211   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7212   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7213   int ncopies;
7214   gimple *use_stmt;
7215   auto_vec<tree> vec_oprnds;
7216   int vec_entry = 0;
7217   poly_uint64 vec_index = 0;
7218
7219   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7220
7221   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7222     return false;
7223
7224   /* FORNOW.  CHECKME.  */
7225   if (nested_in_vect_loop_p (loop, stmt))
7226     return false;
7227
7228   /* If STMT is not relevant and it is a simple assignment and its inputs are
7229      invariant then it can remain in place, unvectorized.  The original last
7230      scalar value that it computes will be used.  */
7231   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7232     {
7233       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7234       if (dump_enabled_p ())
7235         dump_printf_loc (MSG_NOTE, vect_location,
7236                          "statement is simple and uses invariant.  Leaving in "
7237                          "place.\n");
7238       return true;
7239     }
7240
7241   if (slp_node)
7242     ncopies = 1;
7243   else
7244     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7245
7246   if (slp_node)
7247     {
7248       gcc_assert (slp_index >= 0);
7249
7250       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7251       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7252
7253       /* Get the last occurrence of the scalar index from the concatenation of
7254          all the slp vectors. Calculate which slp vector it is and the index
7255          within.  */
7256       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7257
7258       /* Calculate which vector contains the result, and which lane of
7259          that vector we need.  */
7260       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7261         {
7262           if (dump_enabled_p ())
7263             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7264                              "Cannot determine which vector holds the"
7265                              " final result.\n");
7266           return false;
7267         }
7268     }
7269
7270   if (!vec_stmt)
7271     /* No transformation required.  */
7272     return true;
7273
7274   /* If stmt has a related stmt, then use that for getting the lhs.  */
7275   if (is_pattern_stmt_p (stmt_info))
7276     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7277
7278   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7279         : gimple_get_lhs (stmt);
7280   lhs_type = TREE_TYPE (lhs);
7281
7282   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7283              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7284              : TYPE_SIZE (TREE_TYPE (vectype)));
7285   vec_bitsize = TYPE_SIZE (vectype);
7286
7287   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7288   tree vec_lhs, bitstart;
7289   if (slp_node)
7290     {
7291       /* Get the correct slp vectorized stmt.  */
7292       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7293
7294       /* Get entry to use.  */
7295       bitstart = bitsize_int (vec_index);
7296       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7297     }
7298   else
7299     {
7300       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7301       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7302
7303       /* For multiple copies, get the last copy.  */
7304       for (int i = 1; i < ncopies; ++i)
7305         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7306                                                   vec_lhs);
7307
7308       /* Get the last lane in the vector.  */
7309       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7310     }
7311
7312   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7313      loop.  */
7314   gimple_seq stmts = NULL;
7315   tree bftype = TREE_TYPE (vectype);
7316   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7317     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7318   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7319   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7320                                    true, NULL_TREE);
7321   if (stmts)
7322     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7323
7324   /* Replace use of lhs with newly computed result.  If the use stmt is a
7325      single arg PHI, just replace all uses of PHI result.  It's necessary
7326      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7327   use_operand_p use_p;
7328   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7329     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7330         && !is_gimple_debug (use_stmt))
7331     {
7332       if (gimple_code (use_stmt) == GIMPLE_PHI
7333           && gimple_phi_num_args (use_stmt) == 1)
7334         {
7335           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7336         }
7337       else
7338         {
7339           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7340             SET_USE (use_p, new_tree);
7341         }
7342       update_stmt (use_stmt);
7343     }
7344
7345   return true;
7346 }
7347
7348 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
7349
7350 static void
7351 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7352 {
7353   ssa_op_iter op_iter;
7354   imm_use_iterator imm_iter;
7355   def_operand_p def_p;
7356   gimple *ustmt;
7357
7358   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7359     {
7360       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7361         {
7362           basic_block bb;
7363
7364           if (!is_gimple_debug (ustmt))
7365             continue;
7366
7367           bb = gimple_bb (ustmt);
7368
7369           if (!flow_bb_inside_loop_p (loop, bb))
7370             {
7371               if (gimple_debug_bind_p (ustmt))
7372                 {
7373                   if (dump_enabled_p ())
7374                     dump_printf_loc (MSG_NOTE, vect_location,
7375                                      "killing debug use\n");
7376
7377                   gimple_debug_bind_reset_value (ustmt);
7378                   update_stmt (ustmt);
7379                 }
7380               else
7381                 gcc_unreachable ();
7382             }
7383         }
7384     }
7385 }
7386
7387 /* Given loop represented by LOOP_VINFO, return true if computation of
7388    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7389    otherwise.  */
7390
7391 static bool
7392 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7393 {
7394   /* Constant case.  */
7395   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7396     {
7397       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7398       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7399
7400       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7401       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7402       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7403         return true;
7404     }
7405
7406   widest_int max;
7407   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7408   /* Check the upper bound of loop niters.  */
7409   if (get_max_loop_iterations (loop, &max))
7410     {
7411       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7412       signop sgn = TYPE_SIGN (type);
7413       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7414       if (max < type_max)
7415         return true;
7416     }
7417   return false;
7418 }
7419
7420 /* Scale profiling counters by estimation for LOOP which is vectorized
7421    by factor VF.  */
7422
7423 static void
7424 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7425 {
7426   edge preheader = loop_preheader_edge (loop);
7427   /* Reduce loop iterations by the vectorization factor.  */
7428   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7429   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7430
7431   if (freq_h.nonzero_p ())
7432     {
7433       profile_probability p;
7434
7435       /* Avoid dropping loop body profile counter to 0 because of zero count
7436          in loop's preheader.  */
7437       if (!(freq_e == profile_count::zero ()))
7438         freq_e = freq_e.force_nonzero ();
7439       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7440       scale_loop_frequencies (loop, p);
7441     }
7442
7443   edge exit_e = single_exit (loop);
7444   exit_e->probability = profile_probability::always ()
7445                                  .apply_scale (1, new_est_niter + 1);
7446
7447   edge exit_l = single_pred_edge (loop->latch);
7448   profile_probability prob = exit_l->probability;
7449   exit_l->probability = exit_e->probability.invert ();
7450   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7451     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7452 }
7453
7454 /* Function vect_transform_loop.
7455
7456    The analysis phase has determined that the loop is vectorizable.
7457    Vectorize the loop - created vectorized stmts to replace the scalar
7458    stmts in the loop, and update the loop exit condition.
7459    Returns scalar epilogue loop if any.  */
7460
7461 struct loop *
7462 vect_transform_loop (loop_vec_info loop_vinfo)
7463 {
7464   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7465   struct loop *epilogue = NULL;
7466   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7467   int nbbs = loop->num_nodes;
7468   int i;
7469   tree niters_vector = NULL_TREE;
7470   tree step_vector = NULL_TREE;
7471   tree niters_vector_mult_vf = NULL_TREE;
7472   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7473   unsigned int lowest_vf = constant_lower_bound (vf);
7474   bool grouped_store;
7475   bool slp_scheduled = false;
7476   gimple *stmt, *pattern_stmt;
7477   gimple_seq pattern_def_seq = NULL;
7478   gimple_stmt_iterator pattern_def_si = gsi_none ();
7479   bool transform_pattern_stmt = false;
7480   bool check_profitability = false;
7481   unsigned int th;
7482
7483   if (dump_enabled_p ())
7484     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7485
7486   /* Use the more conservative vectorization threshold.  If the number
7487      of iterations is constant assume the cost check has been performed
7488      by our caller.  If the threshold makes all loops profitable that
7489      run at least the (estimated) vectorization factor number of times
7490      checking is pointless, too.  */
7491   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7492   if (th >= vect_vf_for_cost (loop_vinfo)
7493       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7494     {
7495       if (dump_enabled_p ())
7496         dump_printf_loc (MSG_NOTE, vect_location,
7497                          "Profitability threshold is %d loop iterations.\n",
7498                          th);
7499       check_profitability = true;
7500     }
7501
7502   /* Make sure there exists a single-predecessor exit bb.  Do this before
7503      versioning.   */
7504   edge e = single_exit (loop);
7505   if (! single_pred_p (e->dest))
7506     {
7507       split_loop_exit_edge (e);
7508       if (dump_enabled_p ())
7509         dump_printf (MSG_NOTE, "split exit edge\n");
7510     }
7511
7512   /* Version the loop first, if required, so the profitability check
7513      comes first.  */
7514
7515   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7516     {
7517       poly_uint64 versioning_threshold
7518         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
7519       if (check_profitability
7520           && ordered_p (poly_uint64 (th), versioning_threshold))
7521         {
7522           versioning_threshold = ordered_max (poly_uint64 (th),
7523                                               versioning_threshold);
7524           check_profitability = false;
7525         }
7526       vect_loop_versioning (loop_vinfo, th, check_profitability,
7527                             versioning_threshold);
7528       check_profitability = false;
7529     }
7530
7531   /* Make sure there exists a single-predecessor exit bb also on the
7532      scalar loop copy.  Do this after versioning but before peeling
7533      so CFG structure is fine for both scalar and if-converted loop
7534      to make slpeel_duplicate_current_defs_from_edges face matched
7535      loop closed PHI nodes on the exit.  */
7536   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7537     {
7538       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7539       if (! single_pred_p (e->dest))
7540         {
7541           split_loop_exit_edge (e);
7542           if (dump_enabled_p ())
7543             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7544         }
7545     }
7546
7547   tree niters = vect_build_loop_niters (loop_vinfo);
7548   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7549   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7550   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7551   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
7552                               &step_vector, &niters_vector_mult_vf, th,
7553                               check_profitability, niters_no_overflow);
7554   if (niters_vector == NULL_TREE)
7555     {
7556       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && known_eq (lowest_vf, vf))
7557         {
7558           niters_vector
7559             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7560                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
7561           step_vector = build_one_cst (TREE_TYPE (niters));
7562         }
7563       else
7564         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7565                                      &step_vector, niters_no_overflow);
7566     }
7567
7568   /* 1) Make sure the loop header has exactly two entries
7569      2) Make sure we have a preheader basic block.  */
7570
7571   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7572
7573   split_edge (loop_preheader_edge (loop));
7574
7575   /* FORNOW: the vectorizer supports only loops which body consist
7576      of one basic block (header + empty latch). When the vectorizer will
7577      support more involved loop forms, the order by which the BBs are
7578      traversed need to be reconsidered.  */
7579
7580   for (i = 0; i < nbbs; i++)
7581     {
7582       basic_block bb = bbs[i];
7583       stmt_vec_info stmt_info;
7584
7585       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7586            gsi_next (&si))
7587         {
7588           gphi *phi = si.phi ();
7589           if (dump_enabled_p ())
7590             {
7591               dump_printf_loc (MSG_NOTE, vect_location,
7592                                "------>vectorizing phi: ");
7593               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7594             }
7595           stmt_info = vinfo_for_stmt (phi);
7596           if (!stmt_info)
7597             continue;
7598
7599           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7600             vect_loop_kill_debug_uses (loop, phi);
7601
7602           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7603               && !STMT_VINFO_LIVE_P (stmt_info))
7604             continue;
7605
7606           if (STMT_VINFO_VECTYPE (stmt_info)
7607               && (maybe_ne
7608                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
7609               && dump_enabled_p ())
7610             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7611
7612           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7613                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7614                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7615               && ! PURE_SLP_STMT (stmt_info))
7616             {
7617               if (dump_enabled_p ())
7618                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7619               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7620             }
7621         }
7622
7623       pattern_stmt = NULL;
7624       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7625            !gsi_end_p (si) || transform_pattern_stmt;)
7626         {
7627           bool is_store;
7628
7629           if (transform_pattern_stmt)
7630             stmt = pattern_stmt;
7631           else
7632             {
7633               stmt = gsi_stmt (si);
7634               /* During vectorization remove existing clobber stmts.  */
7635               if (gimple_clobber_p (stmt))
7636                 {
7637                   unlink_stmt_vdef (stmt);
7638                   gsi_remove (&si, true);
7639                   release_defs (stmt);
7640                   continue;
7641                 }
7642             }
7643
7644           if (dump_enabled_p ())
7645             {
7646               dump_printf_loc (MSG_NOTE, vect_location,
7647                                "------>vectorizing statement: ");
7648               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7649             }
7650
7651           stmt_info = vinfo_for_stmt (stmt);
7652
7653           /* vector stmts created in the outer-loop during vectorization of
7654              stmts in an inner-loop may not have a stmt_info, and do not
7655              need to be vectorized.  */
7656           if (!stmt_info)
7657             {
7658               gsi_next (&si);
7659               continue;
7660             }
7661
7662           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7663             vect_loop_kill_debug_uses (loop, stmt);
7664
7665           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7666               && !STMT_VINFO_LIVE_P (stmt_info))
7667             {
7668               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7669                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7670                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7671                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7672                 {
7673                   stmt = pattern_stmt;
7674                   stmt_info = vinfo_for_stmt (stmt);
7675                 }
7676               else
7677                 {
7678                   gsi_next (&si);
7679                   continue;
7680                 }
7681             }
7682           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7683                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7684                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7685                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7686             transform_pattern_stmt = true;
7687
7688           /* If pattern statement has def stmts, vectorize them too.  */
7689           if (is_pattern_stmt_p (stmt_info))
7690             {
7691               if (pattern_def_seq == NULL)
7692                 {
7693                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7694                   pattern_def_si = gsi_start (pattern_def_seq);
7695                 }
7696               else if (!gsi_end_p (pattern_def_si))
7697                 gsi_next (&pattern_def_si);
7698               if (pattern_def_seq != NULL)
7699                 {
7700                   gimple *pattern_def_stmt = NULL;
7701                   stmt_vec_info pattern_def_stmt_info = NULL;
7702
7703                   while (!gsi_end_p (pattern_def_si))
7704                     {
7705                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7706                       pattern_def_stmt_info
7707                         = vinfo_for_stmt (pattern_def_stmt);
7708                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7709                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7710                         break;
7711                       gsi_next (&pattern_def_si);
7712                     }
7713
7714                   if (!gsi_end_p (pattern_def_si))
7715                     {
7716                       if (dump_enabled_p ())
7717                         {
7718                           dump_printf_loc (MSG_NOTE, vect_location,
7719                                            "==> vectorizing pattern def "
7720                                            "stmt: ");
7721                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7722                                             pattern_def_stmt, 0);
7723                         }
7724
7725                       stmt = pattern_def_stmt;
7726                       stmt_info = pattern_def_stmt_info;
7727                     }
7728                   else
7729                     {
7730                       pattern_def_si = gsi_none ();
7731                       transform_pattern_stmt = false;
7732                     }
7733                 }
7734               else
7735                 transform_pattern_stmt = false;
7736             }
7737
7738           if (STMT_VINFO_VECTYPE (stmt_info))
7739             {
7740               poly_uint64 nunits
7741                 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7742               if (!STMT_SLP_TYPE (stmt_info)
7743                   && maybe_ne (nunits, vf)
7744                   && dump_enabled_p ())
7745                   /* For SLP VF is set according to unrolling factor, and not
7746                      to vector size, hence for SLP this print is not valid.  */
7747                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7748             }
7749
7750           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7751              reached.  */
7752           if (STMT_SLP_TYPE (stmt_info))
7753             {
7754               if (!slp_scheduled)
7755                 {
7756                   slp_scheduled = true;
7757
7758                   if (dump_enabled_p ())
7759                     dump_printf_loc (MSG_NOTE, vect_location,
7760                                      "=== scheduling SLP instances ===\n");
7761
7762                   vect_schedule_slp (loop_vinfo);
7763                 }
7764
7765               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7766               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7767                 {
7768                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7769                     {
7770                       pattern_def_seq = NULL;
7771                       gsi_next (&si);
7772                     }
7773                   continue;
7774                 }
7775             }
7776
7777           /* -------- vectorize statement ------------ */
7778           if (dump_enabled_p ())
7779             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7780
7781           grouped_store = false;
7782           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7783           if (is_store)
7784             {
7785               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7786                 {
7787                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7788                      interleaving chain was completed - free all the stores in
7789                      the chain.  */
7790                   gsi_next (&si);
7791                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7792                 }
7793               else
7794                 {
7795                   /* Free the attached stmt_vec_info and remove the stmt.  */
7796                   gimple *store = gsi_stmt (si);
7797                   free_stmt_vec_info (store);
7798                   unlink_stmt_vdef (store);
7799                   gsi_remove (&si, true);
7800                   release_defs (store);
7801                 }
7802
7803               /* Stores can only appear at the end of pattern statements.  */
7804               gcc_assert (!transform_pattern_stmt);
7805               pattern_def_seq = NULL;
7806             }
7807           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7808             {
7809               pattern_def_seq = NULL;
7810               gsi_next (&si);
7811             }
7812         }                       /* stmts in BB */
7813
7814       /* Stub out scalar statements that must not survive vectorization.
7815          Doing this here helps with grouped statements, or statements that
7816          are involved in patterns.  */
7817       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
7818            !gsi_end_p (gsi); gsi_next (&gsi))
7819         {
7820           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
7821           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
7822             {
7823               tree lhs = gimple_get_lhs (call);
7824               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7825                 {
7826                   tree zero = build_zero_cst (TREE_TYPE (lhs));
7827                   gimple *new_stmt = gimple_build_assign (lhs, zero);
7828                   gsi_replace (&gsi, new_stmt, true);
7829                 }
7830             }
7831         }
7832     }                           /* BBs in loop */
7833
7834   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
7835      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
7836   if (integer_onep (step_vector))
7837     niters_no_overflow = true;
7838   slpeel_make_loop_iterate_ntimes (loop, niters_vector, step_vector,
7839                                    niters_vector_mult_vf,
7840                                    !niters_no_overflow);
7841
7842   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
7843   scale_profile_for_vect_loop (loop, assumed_vf);
7844
7845   /* The minimum number of iterations performed by the epilogue.  This
7846      is 1 when peeling for gaps because we always need a final scalar
7847      iteration.  */
7848   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7849   /* +1 to convert latch counts to loop iteration counts,
7850      -min_epilogue_iters to remove iterations that cannot be performed
7851        by the vector code.  */
7852   int bias = 1 - min_epilogue_iters;
7853   /* In these calculations the "- 1" converts loop iteration counts
7854      back to latch counts.  */
7855   if (loop->any_upper_bound)
7856     loop->nb_iterations_upper_bound
7857       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
7858                         lowest_vf) - 1;
7859   if (loop->any_likely_upper_bound)
7860     loop->nb_iterations_likely_upper_bound
7861       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
7862                         lowest_vf) - 1;
7863   if (loop->any_estimate)
7864     loop->nb_iterations_estimate
7865       = wi::udiv_floor (loop->nb_iterations_estimate + bias,
7866                         assumed_vf) - 1;
7867
7868   if (dump_enabled_p ())
7869     {
7870       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7871         {
7872           dump_printf_loc (MSG_NOTE, vect_location,
7873                            "LOOP VECTORIZED\n");
7874           if (loop->inner)
7875             dump_printf_loc (MSG_NOTE, vect_location,
7876                              "OUTER LOOP VECTORIZED\n");
7877           dump_printf (MSG_NOTE, "\n");
7878         }
7879       else
7880         {
7881           dump_printf_loc (MSG_NOTE, vect_location,
7882                            "LOOP EPILOGUE VECTORIZED (VS=");
7883           dump_dec (MSG_NOTE, current_vector_size);
7884           dump_printf (MSG_NOTE, ")\n");
7885         }
7886     }
7887
7888   /* Free SLP instances here because otherwise stmt reference counting
7889      won't work.  */
7890   slp_instance instance;
7891   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7892     vect_free_slp_instance (instance);
7893   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7894   /* Clear-up safelen field since its value is invalid after vectorization
7895      since vectorized loop can have loop-carried dependencies.  */
7896   loop->safelen = 0;
7897
7898   /* Don't vectorize epilogue for epilogue.  */
7899   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7900     epilogue = NULL;
7901
7902   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7903     epilogue = NULL;
7904
7905   if (epilogue)
7906     {
7907       auto_vector_sizes vector_sizes;
7908       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
7909       unsigned int next_size = 0;
7910
7911       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7912           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
7913           && known_eq (vf, lowest_vf))
7914         {
7915           unsigned int eiters
7916             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
7917                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
7918           eiters = eiters % lowest_vf;
7919           epilogue->nb_iterations_upper_bound = eiters - 1;
7920
7921           unsigned int ratio;
7922           while (next_size < vector_sizes.length ()
7923                  && !(constant_multiple_p (current_vector_size,
7924                                            vector_sizes[next_size], &ratio)
7925                       && eiters >= lowest_vf / ratio))
7926             next_size += 1;
7927         }
7928       else
7929         while (next_size < vector_sizes.length ()
7930                && maybe_lt (current_vector_size, vector_sizes[next_size]))
7931           next_size += 1;
7932
7933       if (next_size == vector_sizes.length ())
7934         epilogue = NULL;
7935     }
7936
7937   if (epilogue)
7938     {
7939       epilogue->force_vectorize = loop->force_vectorize;
7940       epilogue->safelen = loop->safelen;
7941       epilogue->dont_vectorize = false;
7942
7943       /* We may need to if-convert epilogue to vectorize it.  */
7944       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7945         tree_if_conversion (epilogue);
7946     }
7947
7948   return epilogue;
7949 }
7950
7951 /* The code below is trying to perform simple optimization - revert
7952    if-conversion for masked stores, i.e. if the mask of a store is zero
7953    do not perform it and all stored value producers also if possible.
7954    For example,
7955      for (i=0; i<n; i++)
7956        if (c[i])
7957         {
7958           p1[i] += 1;
7959           p2[i] = p3[i] +2;
7960         }
7961    this transformation will produce the following semi-hammock:
7962
7963    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7964      {
7965        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7966        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7967        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7968        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7969        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7970        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7971      }
7972 */
7973
7974 void
7975 optimize_mask_stores (struct loop *loop)
7976 {
7977   basic_block *bbs = get_loop_body (loop);
7978   unsigned nbbs = loop->num_nodes;
7979   unsigned i;
7980   basic_block bb;
7981   struct loop *bb_loop;
7982   gimple_stmt_iterator gsi;
7983   gimple *stmt;
7984   auto_vec<gimple *> worklist;
7985
7986   vect_location = find_loop_location (loop);
7987   /* Pick up all masked stores in loop if any.  */
7988   for (i = 0; i < nbbs; i++)
7989     {
7990       bb = bbs[i];
7991       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7992            gsi_next (&gsi))
7993         {
7994           stmt = gsi_stmt (gsi);
7995           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7996             worklist.safe_push (stmt);
7997         }
7998     }
7999
8000   free (bbs);
8001   if (worklist.is_empty ())
8002     return;
8003
8004   /* Loop has masked stores.  */
8005   while (!worklist.is_empty ())
8006     {
8007       gimple *last, *last_store;
8008       edge e, efalse;
8009       tree mask;
8010       basic_block store_bb, join_bb;
8011       gimple_stmt_iterator gsi_to;
8012       tree vdef, new_vdef;
8013       gphi *phi;
8014       tree vectype;
8015       tree zero;
8016
8017       last = worklist.pop ();
8018       mask = gimple_call_arg (last, 2);
8019       bb = gimple_bb (last);
8020       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8021          the same loop as if_bb.  It could be different to LOOP when two
8022          level loop-nest is vectorized and mask_store belongs to the inner
8023          one.  */
8024       e = split_block (bb, last);
8025       bb_loop = bb->loop_father;
8026       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8027       join_bb = e->dest;
8028       store_bb = create_empty_bb (bb);
8029       add_bb_to_loop (store_bb, bb_loop);
8030       e->flags = EDGE_TRUE_VALUE;
8031       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8032       /* Put STORE_BB to likely part.  */
8033       efalse->probability = profile_probability::unlikely ();
8034       store_bb->count = efalse->count ();
8035       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8036       if (dom_info_available_p (CDI_DOMINATORS))
8037         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8038       if (dump_enabled_p ())
8039         dump_printf_loc (MSG_NOTE, vect_location,
8040                          "Create new block %d to sink mask stores.",
8041                          store_bb->index);
8042       /* Create vector comparison with boolean result.  */
8043       vectype = TREE_TYPE (mask);
8044       zero = build_zero_cst (vectype);
8045       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8046       gsi = gsi_last_bb (bb);
8047       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8048       /* Create new PHI node for vdef of the last masked store:
8049          .MEM_2 = VDEF <.MEM_1>
8050          will be converted to
8051          .MEM.3 = VDEF <.MEM_1>
8052          and new PHI node will be created in join bb
8053          .MEM_2 = PHI <.MEM_1, .MEM_3>
8054       */
8055       vdef = gimple_vdef (last);
8056       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8057       gimple_set_vdef (last, new_vdef);
8058       phi = create_phi_node (vdef, join_bb);
8059       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8060
8061       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8062       while (true)
8063         {
8064           gimple_stmt_iterator gsi_from;
8065           gimple *stmt1 = NULL;
8066
8067           /* Move masked store to STORE_BB.  */
8068           last_store = last;
8069           gsi = gsi_for_stmt (last);
8070           gsi_from = gsi;
8071           /* Shift GSI to the previous stmt for further traversal.  */
8072           gsi_prev (&gsi);
8073           gsi_to = gsi_start_bb (store_bb);
8074           gsi_move_before (&gsi_from, &gsi_to);
8075           /* Setup GSI_TO to the non-empty block start.  */
8076           gsi_to = gsi_start_bb (store_bb);
8077           if (dump_enabled_p ())
8078             {
8079               dump_printf_loc (MSG_NOTE, vect_location,
8080                                "Move stmt to created bb\n");
8081               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8082             }
8083           /* Move all stored value producers if possible.  */
8084           while (!gsi_end_p (gsi))
8085             {
8086               tree lhs;
8087               imm_use_iterator imm_iter;
8088               use_operand_p use_p;
8089               bool res;
8090
8091               /* Skip debug statements.  */
8092               if (is_gimple_debug (gsi_stmt (gsi)))
8093                 {
8094                   gsi_prev (&gsi);
8095                   continue;
8096                 }
8097               stmt1 = gsi_stmt (gsi);
8098               /* Do not consider statements writing to memory or having
8099                  volatile operand.  */
8100               if (gimple_vdef (stmt1)
8101                   || gimple_has_volatile_ops (stmt1))
8102                 break;
8103               gsi_from = gsi;
8104               gsi_prev (&gsi);
8105               lhs = gimple_get_lhs (stmt1);
8106               if (!lhs)
8107                 break;
8108
8109               /* LHS of vectorized stmt must be SSA_NAME.  */
8110               if (TREE_CODE (lhs) != SSA_NAME)
8111                 break;
8112
8113               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8114                 {
8115                   /* Remove dead scalar statement.  */
8116                   if (has_zero_uses (lhs))
8117                     {
8118                       gsi_remove (&gsi_from, true);
8119                       continue;
8120                     }
8121                 }
8122
8123               /* Check that LHS does not have uses outside of STORE_BB.  */
8124               res = true;
8125               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8126                 {
8127                   gimple *use_stmt;
8128                   use_stmt = USE_STMT (use_p);
8129                   if (is_gimple_debug (use_stmt))
8130                     continue;
8131                   if (gimple_bb (use_stmt) != store_bb)
8132                     {
8133                       res = false;
8134                       break;
8135                     }
8136                 }
8137               if (!res)
8138                 break;
8139
8140               if (gimple_vuse (stmt1)
8141                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
8142                 break;
8143
8144               /* Can move STMT1 to STORE_BB.  */
8145               if (dump_enabled_p ())
8146                 {
8147                   dump_printf_loc (MSG_NOTE, vect_location,
8148                                    "Move stmt to created bb\n");
8149                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8150                 }
8151               gsi_move_before (&gsi_from, &gsi_to);
8152               /* Shift GSI_TO for further insertion.  */
8153               gsi_prev (&gsi_to);
8154             }
8155           /* Put other masked stores with the same mask to STORE_BB.  */
8156           if (worklist.is_empty ()
8157               || gimple_call_arg (worklist.last (), 2) != mask
8158               || worklist.last () != stmt1)
8159             break;
8160           last = worklist.pop ();
8161         }
8162       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8163     }
8164 }