gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55
  56 /* Loop Vectorization Pass.
  57
  58    This pass tries to vectorize loops.
  59
  60    For example, the vectorizer transforms the following simple loop:
  61
  62         short a[N]; short b[N]; short c[N]; int i;
  63
  64         for (i=0; i<N; i++){
  65           a[i] = b[i] + c[i];
  66         }
  67
  68    as if it was manually vectorized by rewriting the source code into:
  69
  70         typedef int __attribute__((mode(V8HI))) v8hi;
  71         short a[N];  short b[N]; short c[N];   int i;
  72         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  73         v8hi va, vb, vc;
  74
  75         for (i=0; i<N/8; i++){
  76           vb = pb[i];
  77           vc = pc[i];
  78           va = vb + vc;
  79           pa[i] = va;
  80         }
  81
  82         The main entry to this pass is vectorize_loops(), in which
  83    the vectorizer applies a set of analyses on a given set of loops,
  84    followed by the actual vectorization transformation for the loops that
  85    had successfully passed the analysis phase.
  86         Throughout this pass we make a distinction between two types of
  87    data: scalars (which are represented by SSA_NAMES), and memory references
  88    ("data-refs").  These two types of data require different handling both
  89    during analysis and transformation. The types of data-refs that the
  90    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  91    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  92    accesses are required to have a simple (consecutive) access pattern.
  93
  94    Analysis phase:
  95    ===============
  96         The driver for the analysis phase is vect_analyze_loop().
  97    It applies a set of analyses, some of which rely on the scalar evolution
  98    analyzer (scev) developed by Sebastian Pop.
  99
 100         During the analysis phase the vectorizer records some information
 101    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 102    loop, as well as general information about the loop as a whole, which is
 103    recorded in a "loop_vec_info" struct attached to each loop.
 104
 105    Transformation phase:
 106    =====================
 107         The loop transformation phase scans all the stmts in the loop, and
 108    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 109    the loop that needs to be vectorized.  It inserts the vector code sequence
 110    just before the scalar stmt S, and records a pointer to the vector code
 111    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 112    attached to S).  This pointer will be used for the vectorization of following
 113    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 114    otherwise, we rely on dead code elimination for removing it.
 115
 116         For example, say stmt S1 was vectorized into stmt VS1:
 117
 118    VS1: vb = px[i];
 119    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 120    S2:  a = b;
 121
 122    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 123    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 124    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 125    resulting sequence would be:
 126
 127    VS1: vb = px[i];
 128    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 129    VS2: va = vb;
 130    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 131
 132         Operands that are not SSA_NAMEs are data-refs that appear in
 133    load/store operations (like 'x[i]' in S1), and are handled differently.
 134
 135    Target modeling:
 136    =================
 137         Currently the only target specific information that is used is the
 138    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 139    Targets that can support different sizes of vectors, for now will need
 140    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 141    flexibility will be added in the future.
 142
 143         Since we only vectorize operations whose vector form can be
 144    expressed using existing tree codes, to verify that an operation is
 145    supported, the vectorizer checks the relevant optab at the relevant
 146    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 147    the value found is CODE_FOR_nothing, then there's no target support, and
 148    we can't vectorize the stmt.
 149
 150    For additional information on this project see:
 151    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 152 */
 153 /* Forward declaration of a file-local (static) function.  */
 154 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 155
 156 /* Function vect_determine_vectorization_factor
 157
 158    Determine the vectorization factor (VF) of the loop described by
 159    LOOP_VINFO.  VF is the number of data elements that are operated upon in
 160    parallel in a single iteration of the vectorized loop.  For example, for
 161    4byte elements on a target with vector size (VS) 16byte, the VF is set
 162    to 4, since 4 elements can fit in a single vector register.
 163
 164    Each relevant or live phi and stmt is examined; VF becomes the largest
 165    number of vector elements found, and stmts lacking a vectype get one
 166    (mask producers in a final pass).  Return true and set
 167    LOOP_VINFO_VECT_FACTOR on success, false if analysis fails or VF <= 1.
 168
 169    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 170    original loop:
 171         for (i=0; i<N; i++){
 172           a[i] = b[i] + c[i];
 173         }
 174
 175    vectorized loop:
 176         for (i=0; i<N; i+=VF){
 177           a[i:VF] = b[i:VF] + c[i:VF];
 178         }
 179 */
 180
 181 static bool
 182 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 183 {
 184   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 185   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 186   unsigned nbbs = loop->num_nodes;
 187   unsigned int vectorization_factor = 0;
 188   tree scalar_type = NULL_TREE;
 189   gphi *phi;
 190   tree vectype;
 191   unsigned int nunits;
 192   stmt_vec_info stmt_info;
 193   unsigned i;
 194   HOST_WIDE_INT dummy;  /* Receives outputs that are ignored.  */
 195   gimple *stmt, *pattern_stmt = NULL;
 196   gimple_seq pattern_def_seq = NULL;
 197   gimple_stmt_iterator pattern_def_si = gsi_none ();
 198   bool analyze_pattern_stmt = false;
 199   bool bool_result;  /* Current stmt produces a scalar boolean.  */
 200   auto_vec<stmt_vec_info> mask_producers;  /* Vectype assigned last.  */
 201
 202   if (dump_enabled_p ())
 203     dump_printf_loc (MSG_NOTE, vect_location,
 204                      "=== vect_determine_vectorization_factor ===\n");
 205
 206   for (i = 0; i < nbbs; i++)
 207     {
 208       basic_block bb = bbs[i];
 209       /* First the phis of BB.  */
 210       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 211            gsi_next (&si))
 212         {
 213           phi = si.phi ();
 214           stmt_info = vinfo_for_stmt (phi);
 215           if (dump_enabled_p ())
 216             {
 217               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 218               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 219             }
 220
 221           gcc_assert (stmt_info);
 222
 223           if (STMT_VINFO_RELEVANT_P (stmt_info)
 224               || STMT_VINFO_LIVE_P (stmt_info))
 225             {
 226               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 227               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 228
 229               if (dump_enabled_p ())
 230                 {
 231                   dump_printf_loc (MSG_NOTE, vect_location,
 232                                    "get vectype for scalar type:  ");
 233                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 234                   dump_printf (MSG_NOTE, "\n");
 235                 }
 236
 237               vectype = get_vectype_for_scalar_type (scalar_type);
 238               if (!vectype)
 239                 {
 240                   if (dump_enabled_p ())
 241                     {
 242                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 243                                        "not vectorized: unsupported "
 244                                        "data-type ");
 245                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 246                                          scalar_type);
 247                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 248                     }
 249                   return false;
 250                 }
 251               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 252
 253               if (dump_enabled_p ())
 254                 {
 255                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 256                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 257                   dump_printf (MSG_NOTE, "\n");
 258                 }
 259
 260               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 261               if (dump_enabled_p ())
 262                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 263                                  nunits);
 264
 265               if (!vectorization_factor
 266                   || (nunits > vectorization_factor))
 267                 vectorization_factor = nunits;
 268             }
 269         }
 270       /* Then its stmts, including pattern stmts and pattern def stmts.  */
 271       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 272            !gsi_end_p (si) || analyze_pattern_stmt;)
 273         {
 274           tree vf_vectype;
 275
 276           if (analyze_pattern_stmt)
 277             stmt = pattern_stmt;
 278           else
 279             stmt = gsi_stmt (si);
 280
 281           stmt_info = vinfo_for_stmt (stmt);
 282
 283           if (dump_enabled_p ())
 284             {
 285               dump_printf_loc (MSG_NOTE, vect_location,
 286                                "==> examining statement: ");
 287               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 288             }
 289
 290           gcc_assert (stmt_info);
 291
 292           /* Skip stmts which do not need to be vectorized.  */
 293           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 294                && !STMT_VINFO_LIVE_P (stmt_info))
 295               || gimple_clobber_p (stmt))
 296             {
 297               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 298                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 299                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 300                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 301                 {
 302                   stmt = pattern_stmt;
 303                   stmt_info = vinfo_for_stmt (pattern_stmt);
 304                   if (dump_enabled_p ())
 305                     {
 306                       dump_printf_loc (MSG_NOTE, vect_location,
 307                                        "==> examining pattern statement: ");
 308                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 309                     }
 310                 }
 311               else
 312                 {
 313                   if (dump_enabled_p ())
 314                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 315                   gsi_next (&si);
 316                   continue;
 317                 }
 318             }
 319           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 320                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 321                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 322                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 323             analyze_pattern_stmt = true;
 324
 325           /* If a pattern statement has def stmts, analyze them too.  */
 326           if (is_pattern_stmt_p (stmt_info))
 327             {
 328               if (pattern_def_seq == NULL)
 329                 {
 330                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 331                   pattern_def_si = gsi_start (pattern_def_seq);
 332                 }
 333               else if (!gsi_end_p (pattern_def_si))
 334                 gsi_next (&pattern_def_si);
 335               if (pattern_def_seq != NULL)
 336                 {
 337                   gimple *pattern_def_stmt = NULL;
 338                   stmt_vec_info pattern_def_stmt_info = NULL;
 339
 340                   while (!gsi_end_p (pattern_def_si))
 341                     {
 342                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 343                       pattern_def_stmt_info
 344                         = vinfo_for_stmt (pattern_def_stmt);
 345                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 346                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 347                         break;
 348                       gsi_next (&pattern_def_si);
 349                     }
 350
 351                   if (!gsi_end_p (pattern_def_si))
 352                     {
 353                       if (dump_enabled_p ())
 354                         {
 355                           dump_printf_loc (MSG_NOTE, vect_location,
 356                                            "==> examining pattern def stmt: ");
 357                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 358                                             pattern_def_stmt, 0);
 359                         }
 360
 361                       stmt = pattern_def_stmt;
 362                       stmt_info = pattern_def_stmt_info;
 363                     }
 364                   else
 365                     {
 366                       pattern_def_si = gsi_none ();
 367                       analyze_pattern_stmt = false;
 368                     }
 369                 }
 370               else
 371                 analyze_pattern_stmt = false;
 372             }
 373
 374           if (gimple_get_lhs (stmt) == NULL_TREE
 375               /* MASK_STORE has no lhs, but is ok.  */
 376               && (!is_gimple_call (stmt)
 377                   || !gimple_call_internal_p (stmt)
 378                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 379             {
 380               if (is_gimple_call (stmt))
 381                 {
 382                   /* Ignore calls with no lhs.  These must be calls to
 383                      #pragma omp simd functions, and the vectorization factor
 384                      they really need can't be determined until
 385                      vectorizable_simd_clone_call.  */
 386                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 387                     {
 388                       pattern_def_seq = NULL;
 389                       gsi_next (&si);
 390                     }
 391                   continue;
 392                 }
 393               if (dump_enabled_p ())
 394                 {
 395                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 396                                    "not vectorized: irregular stmt.");
 397                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 398                                     0);
 399                 }
 400               return false;
 401             }
 402
 403           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 404             {
 405               if (dump_enabled_p ())
 406                 {
 407                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 408                                    "not vectorized: vector stmt in loop:");
 409                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 410                 }
 411               return false;
 412             }
 413
 414           bool_result = false;
 415
 416           if (STMT_VINFO_VECTYPE (stmt_info))
 417             {
 418               /* The only case when a vectype had been already set is for stmts
 419                  that contain a dataref, or for "pattern-stmts" (stmts
 420                  generated by the vectorizer to represent/replace a certain
 421                  idiom).  */
 422               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 423                           || is_pattern_stmt_p (stmt_info)
 424                           || !gsi_end_p (pattern_def_si));
 425               vectype = STMT_VINFO_VECTYPE (stmt_info);
 426             }
 427           else
 428             {
 429               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 430               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 431                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 432               else
 433                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 434
 435               /* Bool ops don't participate in vectorization factor
 436                  computation.  For comparisons use the compared types to
 437                  compute a factor.  */
 438               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 439                   && is_gimple_assign (stmt)
 440                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 441                 {
 442                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 443                       || STMT_VINFO_LIVE_P (stmt_info))
 444                     mask_producers.safe_push (stmt_info);
 445                   bool_result = true;
 446
 447                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 448                       == tcc_comparison
 449                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 450                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 451                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 452                   else
 453                     {
 454                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 455                         {
 456                           pattern_def_seq = NULL;
 457                           gsi_next (&si);
 458                         }
 459                       continue;
 460                     }
 461                 }
 462
 463               if (dump_enabled_p ())
 464                 {
 465                   dump_printf_loc (MSG_NOTE, vect_location,
 466                                    "get vectype for scalar type:  ");
 467                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 468                   dump_printf (MSG_NOTE, "\n");
 469                 }
 470               vectype = get_vectype_for_scalar_type (scalar_type);
 471               if (!vectype)
 472                 {
 473                   if (dump_enabled_p ())
 474                     {
 475                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 476                                        "not vectorized: unsupported "
 477                                        "data-type ");
 478                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 479                                          scalar_type);
 480                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 481                     }
 482                   return false;
 483                 }
 484
 485               if (!bool_result)
 486                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 487
 488               if (dump_enabled_p ())
 489                 {
 490                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 491                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 492                   dump_printf (MSG_NOTE, "\n");
 493                 }
 494             }
 495
 496           /* Don't try to compute VF from scalar types if the stmt
 497              produces a boolean vector.  Use the result vectype instead.  */
 498           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 499             vf_vectype = vectype;
 500           else
 501             {
 502               /* The vectorization factor is according to the smallest
 503                  scalar type (or the largest vector size, but we only
 504                  support one vector size per loop).  */
 505               if (!bool_result)
 506                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 507                                                              &dummy);
 508               if (dump_enabled_p ())
 509                 {
 510                   dump_printf_loc (MSG_NOTE, vect_location,
 511                                    "get vectype for scalar type:  ");
 512                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 513                   dump_printf (MSG_NOTE, "\n");
 514                 }
 515               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 516             }
 517           if (!vf_vectype)
 518             {
 519               if (dump_enabled_p ())
 520                 {
 521                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 522                                    "not vectorized: unsupported data-type ");
 523                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 524                                      scalar_type);
 525                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 526                 }
 527               return false;
 528             }
 529           /* The modes of VECTYPE and VF_VECTYPE must be the same size.  */
 530           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 531                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 532             {
 533               if (dump_enabled_p ())
 534                 {
 535                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 536                                    "not vectorized: different sized vector "
 537                                    "types in statement, ");
 538                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 539                                      vectype);
 540                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 541                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 542                                      vf_vectype);
 543                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 544                 }
 545               return false;
 546             }
 547
 548           if (dump_enabled_p ())
 549             {
 550               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 551               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 552               dump_printf (MSG_NOTE, "\n");
 553             }
 554
 555           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 556           if (dump_enabled_p ())
 557             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 558           if (!vectorization_factor
 559               || (nunits > vectorization_factor))
 560             vectorization_factor = nunits;
 561           /* Advance SI unless a pattern stmt or def stmt is pending.  */
 562           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 563             {
 564               pattern_def_seq = NULL;
 565               gsi_next (&si);
 566             }
 567         }
 568     }
 569
 570   /* TODO: Analyze cost.  Decide if it is worthwhile to vectorize.  */
 571   if (dump_enabled_p ())
 572     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 573                      vectorization_factor);
 574   if (vectorization_factor <= 1)
 575     {
 576       if (dump_enabled_p ())
 577         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 578                          "not vectorized: unsupported data-type\n");
 579       return false;
 580     }
 581   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 582   /* Finally compute a vectype for every recorded mask producer.  */
 583   for (i = 0; i < mask_producers.length (); i++)
 584     {
 585       tree mask_type = NULL;
 586
 587       stmt = STMT_VINFO_STMT (mask_producers[i]);
 588
 589       if (is_gimple_assign (stmt)
 590           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 591           && !VECT_SCALAR_BOOLEAN_TYPE_P
 592                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 593         {
 594           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 595           mask_type = get_mask_type_for_scalar_type (scalar_type);
 596
 597           if (!mask_type)
 598             {
 599               if (dump_enabled_p ())
 600                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 601                                  "not vectorized: unsupported mask\n");
 602               return false;
 603             }
 604         }
 605       else
 606         {
 607           tree rhs;
 608           ssa_op_iter iter;
 609           gimple *def_stmt;
 610           enum vect_def_type dt;
 611
 612           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 613             {
 614               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 615                                        &def_stmt, &dt, &vectype))
 616                 {
 617                   if (dump_enabled_p ())
 618                     {
 619                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 620                                        "not vectorized: can't compute mask type "
 621                                        "for statement, ");
 622                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 623                                         0);
 624                     }
 625                   return false;
 626                 }
 627
 628               /* No vectype probably means external definition.
 629                  Allow it in case there is another operand which
 630                  allows the mask type to be determined.  */
 631               if (!vectype)
 632                 continue;
 633
 634               if (!mask_type)
 635                 mask_type = vectype;
 636               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 637                        != TYPE_VECTOR_SUBPARTS (vectype))
 638                 {
 639                   if (dump_enabled_p ())
 640                     {
 641                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 642                                        "not vectorized: different sized masks "
 643                                        "types in statement, ");
 644                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 645                                          mask_type);
 646                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 647                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 648                                          vectype);
 649                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 650                     }
 651                   return false;
 652                 }
 653               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 654                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 655                 {
 656                   if (dump_enabled_p ())
 657                     {
 658                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 659                                        "not vectorized: mixed mask and "
 660                                        "nonmask vector types in statement, ");
 661                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 662                                          mask_type);
 663                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 664                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 665                                          vectype);
 666                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 667                     }
 668                   return false;
 669                 }
 670             }
 671
 672           /* We may compare a boolean value loaded as a vector of integers.
 673              Fix mask_type in such a case.  */
 674           if (mask_type
 675               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 676               && gimple_code (stmt) == GIMPLE_ASSIGN
 677               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 678             mask_type = build_same_sized_truth_vector_type (mask_type);
 679         }
 680
 681       /* No mask_type should mean loop invariant predicate.
 682          This is probably a subject for optimization in
 683          if-conversion.  */
 684       if (!mask_type)
 685         {
 686           if (dump_enabled_p ())
 687             {
 688               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 689                                "not vectorized: can't compute mask type "
 690                                "for statement, ");
 691               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 692                                 0);
 693             }
 694           return false;
 695         }
 696
 697       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 698     }
 699
 700   return true;
 701 }
 702
 703
 704 /* Function vect_is_simple_iv_evolution.
 705
 706    Return true if ACCESS_FN advances in loop LOOP_NB by a step that the
 707    checks below accept.  *INIT and *STEP are set before the step check.  */
 708
 709 static bool
 710 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 711                              tree * step)
 712 {
 713   tree evolution = evolution_part_in_loop_num (access_fn, loop_nb);
 714
 715   /* Neither a missing evolution in this loop nor one that is itself a
 716      chrec (a polynomial of degree >= 2) counts as "simple".  */
 717   if (evolution == NULL_TREE || tree_is_chrec (evolution))
 718     return false;
 719
 720   tree base = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 721
 722   if (dump_enabled_p ())
 723     {
 724       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 725       dump_generic_expr (MSG_NOTE, TDF_SLIM, evolution);
 726       dump_printf (MSG_NOTE, ",  init: ");
 727       dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
 728       dump_printf (MSG_NOTE, "\n");
 729     }
 730
 731   /* The outputs are stored before the step is validated, so they are
 732      written even on the failure path below.  */
 733   *init = base;
 734   *step = evolution;
 735
 736   /* Accept an integer constant, a real constant when
 737      flag_associative_math is set, or an SSA name defined outside the loop
 738      whose type is integral (or float with flag_associative_math).  */
 739   const enum tree_code code = TREE_CODE (evolution);
 740   bool step_ok = (code == INTEGER_CST);
 741   if (code == REAL_CST)
 742     step_ok = flag_associative_math != 0;
 743   else if (code == SSA_NAME)
 744     {
 745       basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (evolution));
 746       tree step_type = TREE_TYPE (evolution);
 747       if (!def_bb || !flow_bb_inside_loop_p (get_loop (cfun, loop_nb), def_bb))
 748         step_ok = (INTEGRAL_TYPE_P (step_type)
 749                    || (SCALAR_FLOAT_TYPE_P (step_type)
 750                        && flag_associative_math));
 751     }
 752
 753   if (step_ok)
 754     return true;
 755
 756   if (dump_enabled_p ())
 757     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 758                      "step unknown.\n");
 759   return false;
 760 }
 761
 762 /* Function vect_analyze_scalar_cycles_1.
 763
 764    Examine the cross iteration def-use cycles of scalar variables
 765    in LOOP.  LOOP_VINFO represents the loop that is now being
 766    considered for vectorization (can be LOOP, or an outer-loop
 767    enclosing LOOP).  */
 768
 769 static void
 770 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 771 {
 772   basic_block bb = loop->header;
 773   tree init, step;
 774   auto_vec<gimple *, 64> worklist;
 775   gphi_iterator gsi;
 776   bool double_reduc;
 777
 778   if (dump_enabled_p ())
 779     dump_printf_loc (MSG_NOTE, vect_location,
 780                      "=== vect_analyze_scalar_cycles ===\n");
 781
 782   /* First - identify all inductions.  Reduction detection assumes that all the
 783      inductions have been identified, therefore, this order must not be
 784      changed.  */
 785   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 786     {
 787       gphi *phi = gsi.phi ();
 788       tree access_fn = NULL;
 789       tree def = PHI_RESULT (phi);
 790       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 791
 792       if (dump_enabled_p ())
 793         {
 794           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 795           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 796         }
 797
 798       /* Skip virtual phi's.  The data dependences that are associated with
 799          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 800       if (virtual_operand_p (def))
 801         continue;
 802
 803       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 804
 805       /* Analyze the evolution function.  */
 806       access_fn = analyze_scalar_evolution (loop, def);
 807       if (access_fn)
 808         {
 809           STRIP_NOPS (access_fn);
 810           if (dump_enabled_p ())
 811             {
 812               dump_printf_loc (MSG_NOTE, vect_location,
 813                                "Access function of PHI: ");
 814               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 815               dump_printf (MSG_NOTE, "\n");
 816             }
 817           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 818             = initial_condition_in_loop_num (access_fn, loop->num);
 819           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 820             = evolution_part_in_loop_num (access_fn, loop->num);
 821         }
 822
 823       if (!access_fn
 824           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 825           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 826               && TREE_CODE (step) != INTEGER_CST))
 827         {
 828           worklist.safe_push (phi);
 829           continue;
 830         }
 831
 832       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 833                   != NULL_TREE);
 834       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 835
 836       if (dump_enabled_p ())
 837         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 838       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 839     }
 840
 841
 842   /* Second - identify all reductions and nested cycles.  */
 843   while (worklist.length () > 0)
 844     {
 845       gimple *phi = worklist.pop ();
 846       tree def = PHI_RESULT (phi);
 847       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 848       gimple *reduc_stmt;
 849
 850       if (dump_enabled_p ())
 851         {
 852           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 853           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 854         }
 855
 856       gcc_assert (!virtual_operand_p (def)
 857                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 858
 859       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 860                                                 &double_reduc, false);
 861       if (reduc_stmt)
 862         {
 863           if (double_reduc)
 864             {
 865               if (dump_enabled_p ())
 866                 dump_printf_loc (MSG_NOTE, vect_location,
 867                                  "Detected double reduction.\n");
 868
 869               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 870               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 871                                                     vect_double_reduction_def;
 872             }
 873           else
 874             {
 875               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 876                 {
 877                   if (dump_enabled_p ())
 878                     dump_printf_loc (MSG_NOTE, vect_location,
 879                                      "Detected vectorizable nested cycle.\n");
 880
 881                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 882                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 883                                                              vect_nested_cycle;
 884                 }
 885               else
 886                 {
 887                   if (dump_enabled_p ())
 888                     dump_printf_loc (MSG_NOTE, vect_location,
 889                                      "Detected reduction.\n");
 890
 891                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 892                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 893                                                            vect_reduction_def;
 894                   /* Store the reduction cycles for possible vectorization in
 895                      loop-aware SLP if it was not detected as reduction
 896                      chain.  */
 897                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 898                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 899                 }
 900             }
 901         }
 902       else
 903         if (dump_enabled_p ())
 904           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 905                            "Unknown def-use cycle pattern.\n");
 906     }
 907 }
 908
 909
 910 /* Function vect_analyze_scalar_cycles.
 911
 912    Examine the cross iteration def-use cycles of scalar variables, by
 913    analyzing the loop-header PHIs of scalar variables.  Classify each
 914    cycle as one of the following: invariant, induction, reduction, unknown.
 915    We do that for the loop represented by LOOP_VINFO, and also to its
 916    inner-loop, if exists.
 917    Examples for scalar cycles:
 918
 919    Example1: reduction:
 920
 921               loop1:
 922               for (i=0; i<N; i++)
 923                  sum += a[i];
 924
 925    Example2: induction:
 926
 927               loop2:
 928               for (i=0; i<N; i++)
 929                  a[i] = i;  */
 930
 931 static void
 932 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 933 {
 934   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 935
 936   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 937
 938   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 939      Reductions in such inner-loop therefore have different properties than
 940      the reductions in the nest that gets vectorized:
 941      1. When vectorized, they are executed in the same order as in the original
 942         scalar loop, so we can't change the order of computation when
 943         vectorizing them.
 944      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 945         current checks are too strict.  */
 946
 947   if (loop->inner)
 948     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 949 }
 950
 951 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 952
 953 static void
 954 vect_fixup_reduc_chain (gimple *stmt)
 955 {
 956   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 957   gimple *stmtp;
 958   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 959               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 960   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 961   do
 962     {
 963       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 964       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 965       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 966       if (stmt)
 967         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 968           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 969     }
 970   while (stmt);
 971   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 972 }
 973
 974 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 975
 976 static void
 977 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 978 {
 979   gimple *first;
 980   unsigned i;
 981
 982   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 983     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 984       {
 985         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 986         while (next)
 987           {
 988             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 989               break;
 990             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 991           }
 992         /* If not all stmt in the chain are patterns try to handle
 993            the chain without patterns.  */
 994         if (! next)
 995           {
 996             vect_fixup_reduc_chain (first);
 997             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 998               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 999           }
1000       }
1001 }
1002
1003 /* Function vect_get_loop_niters.
1004
1005    Determine how many iterations the loop is executed and place it
1006    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1007    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1008    niter information holds in ASSUMPTIONS.
1009
1010    Return the loop exit condition.  */
1011
1012
1013 static gcond *
1014 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1015                       tree *number_of_iterations, tree *number_of_iterationsm1)
1016 {
1017   edge exit = single_exit (loop);
1018   struct tree_niter_desc niter_desc;
1019   tree niter_assumptions, niter, may_be_zero;
1020   gcond *cond = get_loop_exit_condition (loop);
1021
1022   *assumptions = boolean_true_node;
1023   *number_of_iterationsm1 = chrec_dont_know;
1024   *number_of_iterations = chrec_dont_know;
1025   if (dump_enabled_p ())
1026     dump_printf_loc (MSG_NOTE, vect_location,
1027                      "=== get_loop_niters ===\n");
1028
1029   if (!exit)
1030     return cond;
1031
1032   niter = chrec_dont_know;
1033   may_be_zero = NULL_TREE;
1034   niter_assumptions = boolean_true_node;
1035   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1036       || chrec_contains_undetermined (niter_desc.niter))
1037     return cond;
1038
1039   niter_assumptions = niter_desc.assumptions;
1040   may_be_zero = niter_desc.may_be_zero;
1041   niter = niter_desc.niter;
1042
1043   if (may_be_zero && integer_zerop (may_be_zero))
1044     may_be_zero = NULL_TREE;
1045
1046   if (may_be_zero)
1047     {
1048       if (COMPARISON_CLASS_P (may_be_zero))
1049         {
1050           /* Try to combine may_be_zero with assumptions, this can simplify
1051              computation of niter expression.  */
1052           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1053             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1054                                              niter_assumptions,
1055                                              fold_build1 (TRUTH_NOT_EXPR,
1056                                                           boolean_type_node,
1057                                                           may_be_zero));
1058           else
1059             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1060                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1061
1062           may_be_zero = NULL_TREE;
1063         }
1064       else if (integer_nonzerop (may_be_zero))
1065         {
1066           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1067           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1068           return cond;
1069         }
1070       else
1071         return cond;
1072     }
1073
1074   *assumptions = niter_assumptions;
1075   *number_of_iterationsm1 = niter;
1076
1077   /* We want the number of loop header executions which is the number
1078      of latch executions plus one.
1079      ???  For UINT_MAX latch executions this number overflows to zero
1080      for loops like do { n++; } while (n != 0);  */
1081   if (niter && !chrec_contains_undetermined (niter))
1082     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1083                           build_int_cst (TREE_TYPE (niter), 1));
1084   *number_of_iterations = niter;
1085
1086   return cond;
1087 }
1088
1089 /* Function bb_in_loop_p
1090
1091    Used as predicate for dfs order traversal of the loop bbs.  */
1092
1093 static bool
1094 bb_in_loop_p (const_basic_block bb, const void *data)
1095 {
1096   const struct loop *const loop = (const struct loop *)data;
1097   if (flow_bb_inside_loop_p (loop, bb))
1098     return true;
1099   return false;
1100 }
1101
1102
1103 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1104    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1105
1106 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1107   : vec_info (vec_info::loop, init_cost (loop_in)),
1108     loop (loop_in),
1109     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1110     num_itersm1 (NULL_TREE),
1111     num_iters (NULL_TREE),
1112     num_iters_unchanged (NULL_TREE),
1113     num_iters_assumptions (NULL_TREE),
1114     th (0),
1115     vectorization_factor (0),
1116     max_vectorization_factor (0),
1117     unaligned_dr (NULL),
1118     peeling_for_alignment (0),
1119     ptr_mask (0),
1120     slp_unrolling_factor (1),
1121     single_scalar_iteration_cost (0),
1122     vectorizable (false),
1123     peeling_for_gaps (false),
1124     peeling_for_niter (false),
1125     operands_swapped (false),
1126     no_data_dependencies (false),
1127     has_mask_store (false),
1128     scalar_loop (NULL),
1129     orig_loop_info (NULL)
1130 {
1131   /* Create/Update stmt_info for all stmts in the loop.  */
1132   basic_block *body = get_loop_body (loop);
1133   for (unsigned int i = 0; i < loop->num_nodes; i++)
1134     {
1135       basic_block bb = body[i];
1136       gimple_stmt_iterator si;
1137
1138       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1139         {
1140           gimple *phi = gsi_stmt (si);
1141           gimple_set_uid (phi, 0);
1142           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1143         }
1144
1145       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1146         {
1147           gimple *stmt = gsi_stmt (si);
1148           gimple_set_uid (stmt, 0);
1149           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1150         }
1151     }
1152   free (body);
1153
1154   /* CHECKME: We want to visit all BBs before their successors (except for
1155      latch blocks, for which this assertion wouldn't hold).  In the simple
1156      case of the loop forms we allow, a dfs order of the BBs would the same
1157      as reversed postorder traversal, so we are safe.  */
1158
1159   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1160                                           bbs, loop->num_nodes, loop);
1161   gcc_assert (nbbs == loop->num_nodes);
1162 }
1163
1164
1165 /* Free all memory used by the _loop_vec_info, as well as all the
1166    stmt_vec_info structs of all the stmts in the loop.  */
1167
1168 _loop_vec_info::~_loop_vec_info ()
1169 {
1170   int nbbs;
1171   gimple_stmt_iterator si;
1172   int j;
1173
1174   nbbs = loop->num_nodes;
1175   for (j = 0; j < nbbs; j++)
1176     {
1177       basic_block bb = bbs[j];
1178       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1179         free_stmt_vec_info (gsi_stmt (si));
1180
1181       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1182         {
1183           gimple *stmt = gsi_stmt (si);
1184
1185           /* We may have broken canonical form by moving a constant
1186              into RHS1 of a commutative op.  Fix such occurrences.  */
1187           if (operands_swapped && is_gimple_assign (stmt))
1188             {
1189               enum tree_code code = gimple_assign_rhs_code (stmt);
1190
1191               if ((code == PLUS_EXPR
1192                    || code == POINTER_PLUS_EXPR
1193                    || code == MULT_EXPR)
1194                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1195                 swap_ssa_operands (stmt,
1196                                    gimple_assign_rhs1_ptr (stmt),
1197                                    gimple_assign_rhs2_ptr (stmt));
1198               else if (code == COND_EXPR
1199                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1200                 {
1201                   tree cond_expr = gimple_assign_rhs1 (stmt);
1202                   enum tree_code cond_code = TREE_CODE (cond_expr);
1203
1204                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1205                     {
1206                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1207                                                                   0));
1208                       cond_code = invert_tree_comparison (cond_code,
1209                                                           honor_nans);
1210                       if (cond_code != ERROR_MARK)
1211                         {
1212                           TREE_SET_CODE (cond_expr, cond_code);
1213                           swap_ssa_operands (stmt,
1214                                              gimple_assign_rhs2_ptr (stmt),
1215                                              gimple_assign_rhs3_ptr (stmt));
1216                         }
1217                     }
1218                 }
1219             }
1220
1221           /* Free stmt_vec_info.  */
1222           free_stmt_vec_info (stmt);
1223           gsi_next (&si);
1224         }
1225     }
1226
1227   free (bbs);
1228
1229   loop->aux = NULL;
1230 }
1231
1232
1233 /* Calculate the cost of one scalar iteration of the loop.  */
1234 static void
1235 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1236 {
1237   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1238   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1239   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1240   int innerloop_iters, i;
1241
1242   /* Count statements in scalar loop.  Using this as scalar cost for a single
1243      iteration for now.
1244
1245      TODO: Add outer loop support.
1246
1247      TODO: Consider assigning different costs to different scalar
1248      statements.  */
1249
1250   /* FORNOW.  */
1251   innerloop_iters = 1;
1252   if (loop->inner)
1253     innerloop_iters = 50; /* FIXME */
1254
1255   for (i = 0; i < nbbs; i++)
1256     {
1257       gimple_stmt_iterator si;
1258       basic_block bb = bbs[i];
1259
1260       if (bb->loop_father == loop->inner)
1261         factor = innerloop_iters;
1262       else
1263         factor = 1;
1264
1265       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1266         {
1267           gimple *stmt = gsi_stmt (si);
1268           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1269
1270           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1271             continue;
1272
1273           /* Skip stmts that are not vectorized inside the loop.  */
1274           if (stmt_info
1275               && !STMT_VINFO_RELEVANT_P (stmt_info)
1276               && (!STMT_VINFO_LIVE_P (stmt_info)
1277                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1278               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1279             continue;
1280
1281           vect_cost_for_stmt kind;
1282           if (STMT_VINFO_DATA_REF (stmt_info))
1283             {
1284               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1285                kind = scalar_load;
1286              else
1287                kind = scalar_store;
1288             }
1289           else
1290             kind = scalar_stmt;
1291
1292           scalar_single_iter_cost
1293             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1294                                  factor, kind, stmt_info, 0, vect_prologue);
1295         }
1296     }
1297   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1298     = scalar_single_iter_cost;
1299 }
1300
1301
1302 /* Function vect_analyze_loop_form_1.
1303
1304    Verify that certain CFG restrictions hold, including:
1305    - the loop has a pre-header
1306    - the loop has a single entry and exit
1307    - the loop exit condition is simple enough
1308    - the number of iterations can be analyzed, i.e, a countable loop.  The
1309      niter could be analyzed under some assumptions.  */
1310
1311 bool
1312 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1313                           tree *assumptions, tree *number_of_iterationsm1,
1314                           tree *number_of_iterations, gcond **inner_loop_cond)
1315 {
1316   if (dump_enabled_p ())
1317     dump_printf_loc (MSG_NOTE, vect_location,
1318                      "=== vect_analyze_loop_form ===\n");
1319
1320   /* Different restrictions apply when we are considering an inner-most loop,
1321      vs. an outer (nested) loop.
1322      (FORNOW. May want to relax some of these restrictions in the future).  */
1323
1324   if (!loop->inner)
1325     {
1326       /* Inner-most loop.  We currently require that the number of BBs is
1327          exactly 2 (the header and latch).  Vectorizable inner-most loops
1328          look like this:
1329
1330                         (pre-header)
1331                            |
1332                           header <--------+
1333                            | |            |
1334                            | +--> latch --+
1335                            |
1336                         (exit-bb)  */
1337
1338       if (loop->num_nodes != 2)
1339         {
1340           if (dump_enabled_p ())
1341             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1342                              "not vectorized: control flow in loop.\n");
1343           return false;
1344         }
1345
1346       if (empty_block_p (loop->header))
1347         {
1348           if (dump_enabled_p ())
1349             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1350                              "not vectorized: empty loop.\n");
1351           return false;
1352         }
1353     }
1354   else
1355     {
1356       struct loop *innerloop = loop->inner;
1357       edge entryedge;
1358
1359       /* Nested loop. We currently require that the loop is doubly-nested,
1360          contains a single inner loop, and the number of BBs is exactly 5.
1361          Vectorizable outer-loops look like this:
1362
1363                         (pre-header)
1364                            |
1365                           header <---+
1366                            |         |
1367                           inner-loop |
1368                            |         |
1369                           tail ------+
1370                            |
1371                         (exit-bb)
1372
1373          The inner-loop has the properties expected of inner-most loops
1374          as described above.  */
1375
1376       if ((loop->inner)->inner || (loop->inner)->next)
1377         {
1378           if (dump_enabled_p ())
1379             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1380                              "not vectorized: multiple nested loops.\n");
1381           return false;
1382         }
1383
1384       if (loop->num_nodes != 5)
1385         {
1386           if (dump_enabled_p ())
1387             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1388                              "not vectorized: control flow in loop.\n");
1389           return false;
1390         }
1391
1392       entryedge = loop_preheader_edge (innerloop);
1393       if (entryedge->src != loop->header
1394           || !single_exit (innerloop)
1395           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1396         {
1397           if (dump_enabled_p ())
1398             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1399                              "not vectorized: unsupported outerloop form.\n");
1400           return false;
1401         }
1402
1403       /* Analyze the inner-loop.  */
1404       tree inner_niterm1, inner_niter, inner_assumptions;
1405       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1406                                       &inner_assumptions, &inner_niterm1,
1407                                       &inner_niter, NULL)
1408           /* Don't support analyzing niter under assumptions for inner
1409              loop.  */
1410           || !integer_onep (inner_assumptions))
1411         {
1412           if (dump_enabled_p ())
1413             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1414                              "not vectorized: Bad inner loop.\n");
1415           return false;
1416         }
1417
1418       if (!expr_invariant_in_loop_p (loop, inner_niter))
1419         {
1420           if (dump_enabled_p ())
1421             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1422                              "not vectorized: inner-loop count not"
1423                              " invariant.\n");
1424           return false;
1425         }
1426
1427       if (dump_enabled_p ())
1428         dump_printf_loc (MSG_NOTE, vect_location,
1429                          "Considering outer-loop vectorization.\n");
1430     }
1431
1432   if (!single_exit (loop)
1433       || EDGE_COUNT (loop->header->preds) != 2)
1434     {
1435       if (dump_enabled_p ())
1436         {
1437           if (!single_exit (loop))
1438             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1439                              "not vectorized: multiple exits.\n");
1440           else if (EDGE_COUNT (loop->header->preds) != 2)
1441             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1442                              "not vectorized: too many incoming edges.\n");
1443         }
1444       return false;
1445     }
1446
1447   /* We assume that the loop exit condition is at the end of the loop. i.e,
1448      that the loop is represented as a do-while (with a proper if-guard
1449      before the loop if needed), where the loop header contains all the
1450      executable statements, and the latch is empty.  */
1451   if (!empty_block_p (loop->latch)
1452       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1453     {
1454       if (dump_enabled_p ())
1455         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1456                          "not vectorized: latch block not empty.\n");
1457       return false;
1458     }
1459
1460   /* Make sure the exit is not abnormal.  */
1461   edge e = single_exit (loop);
1462   if (e->flags & EDGE_ABNORMAL)
1463     {
1464       if (dump_enabled_p ())
1465         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1466                          "not vectorized: abnormal loop exit edge.\n");
1467       return false;
1468     }
1469
1470   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1471                                      number_of_iterationsm1);
1472   if (!*loop_cond)
1473     {
1474       if (dump_enabled_p ())
1475         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1476                          "not vectorized: complicated exit condition.\n");
1477       return false;
1478     }
1479
1480   if (integer_zerop (*assumptions)
1481       || !*number_of_iterations
1482       || chrec_contains_undetermined (*number_of_iterations))
1483     {
1484       if (dump_enabled_p ())
1485         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1486                          "not vectorized: number of iterations cannot be "
1487                          "computed.\n");
1488       return false;
1489     }
1490
1491   if (integer_zerop (*number_of_iterations))
1492     {
1493       if (dump_enabled_p ())
1494         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1495                          "not vectorized: number of iterations = 0.\n");
1496       return false;
1497     }
1498
1499   return true;
1500 }
1501
1502 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.

     Return NULL if vect_analyze_loop_form_1 rejects LOOP.  Otherwise
     allocate a loop_vec_info for LOOP, record in it the iteration counts
     computed by vect_analyze_loop_form_1 (and, when ASSUMPTIONS is not the
     constant 1, the assumptions themselves), mark the loop exit
     condition(s) as loop_exit_ctrl_vec_info_type, store the new
     loop_vec_info in LOOP->aux and return it.  */
1503
1504 loop_vec_info
1505 vect_analyze_loop_form (struct loop *loop)
1506 {
1507   tree assumptions, number_of_iterations, number_of_iterationsm1;
1508   gcond *loop_cond, *inner_loop_cond = NULL;
1509
1510   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1511                                   &assumptions, &number_of_iterationsm1,
1512                                   &number_of_iterations, &inner_loop_cond))
1513     return NULL;
1514
1515   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1516   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
       /* NITERS and NITERS_UNCHANGED start out with the same value.  */
1517   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1518   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1519   if (!integer_onep (assumptions))
1520     {
1521       /* We consider to vectorize this loop by versioning it under
1522          some assumptions.  In order to do this, we need to clear
1523          existing information computed by scev and niter analyzer.  */
1524       scev_reset_htab ();
1525       free_numbers_of_iterations_estimates (loop);
1526       /* Also set flag for this loop so that following scev and niter
1527          analysis are done under the assumptions.  */
1528       loop_constraint_set (loop, LOOP_C_FINITE);
1529       /* Also record the assumptions for versioning.  */
1530       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1531     }
1532
       /* Diagnostics only: when the iteration count is not known, dump the
          symbolic expression that was computed for it.  */
1533   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1534     {
1535       if (dump_enabled_p ())
1536         {
1537           dump_printf_loc (MSG_NOTE, vect_location,
1538                            "Symbolic number of iterations is ");
1539           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1540           dump_printf (MSG_NOTE, "\n");
1541         }
1542     }
1543
       /* Tag the exit condition of LOOP, and the inner-loop condition when
          vect_analyze_loop_form_1 reported one, as loop exit control.  */
1544   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1545   if (inner_loop_cond)
1546     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1547       = loop_exit_ctrl_vec_info_type;
1548
       /* LOOP must not already have something attached to its aux field.  */
1549   gcc_assert (!loop->aux);
1550   loop->aux = loop_vinfo;
1551   return loop_vinfo;
1552 }
1553
1554
1555
1556 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1557    statements update the vectorization factor.

     LOOP_VINFO must already have a nonzero vectorization factor.  If no
     statement of the loop that is relevant (or that is a vectorizable
     cycle definition) lacks the pure-SLP marking, the factor is replaced
     by the SLP unrolling factor; otherwise it becomes the least common
     multiple of the current factor and the SLP unrolling factor.  For a
     statement that is in a pattern and has a related statement, the
     related statement is the one that is tested.  */
1558
1559 static void
1560 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1561 {
1562   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1564   int nbbs = loop->num_nodes;
1565   unsigned int vectorization_factor;
1566   int i;
1567
1568   if (dump_enabled_p ())
1569     dump_printf_loc (MSG_NOTE, vect_location,
1570                      "=== vect_update_vf_for_slp ===\n");
1571
1572   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1573   gcc_assert (vectorization_factor != 0);
1574
1575   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1576      vectorization factor of the loop is the unrolling factor required by
1577      the SLP instances.  If that unrolling factor is 1, we say, that we
1578      perform pure SLP on loop - cross iteration parallelism is not
1579      exploited.  */
1580   bool only_slp_in_loop = true;
1581   for (i = 0; i < nbbs; i++)
1582     {
1583       basic_block bb = bbs[i];
1584       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1585            gsi_next (&si))
1586         {
1587           gimple *stmt = gsi_stmt (si);
1588           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
               /* For a statement replaced by a pattern, look at the pattern
                  statement instead of the original one.  */
1589           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1590               && STMT_VINFO_RELATED_STMT (stmt_info))
1591             {
1592               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1593               stmt_info = vinfo_for_stmt (stmt);
1594             }
1595           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1596                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1597               && !PURE_SLP_STMT (stmt_info))
1598             /* STMT needs both SLP and loop-based vectorization.  */
1599             only_slp_in_loop = false;
1600         }
1601     }
1602
       /* Guard the dumps with dump_enabled_p () like every other dump call
          in this function.  */
1603   if (only_slp_in_loop)
1604     {
           if (dump_enabled_p ())
1605         dump_printf_loc (MSG_NOTE, vect_location,
1606                          "Loop contains only SLP stmts\n");
1607       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1608     }
1609   else
1610     {
           if (dump_enabled_p ())
1611         dump_printf_loc (MSG_NOTE, vect_location,
1612                          "Loop contains SLP and non-SLP stmts\n");
1613       vectorization_factor
1614         = least_common_multiple (vectorization_factor,
1615                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1616     }
1617
1618   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
       /* VECTORIZATION_FACTOR is unsigned, so print it with %u.  */
1619   if (dump_enabled_p ())
1620     dump_printf_loc (MSG_NOTE, vect_location,
1621                      "Updating vectorization factor to %u\n",
1622                      vectorization_factor);
1623 }
1624
1625 /* Function vect_analyze_loop_operations.
1626
1627    Scan the loop stmts and make sure they are all vectorizable.

     Walk every basic block of the loop described by LOOP_VINFO, first its
     PHI nodes and then its other statements.  Return false as soon as a
     PHI or statement is found that is not supported, and also when no
     PHI or statement set NEED_TO_VECTORIZE.  Return true otherwise.  */
1628
1629 static bool
1630 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1631 {
1632   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1633   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1634   int nbbs = loop->num_nodes;
1635   int i;
1636   stmt_vec_info stmt_info;
1637   bool need_to_vectorize = false;
1638   bool ok;
1639
1640   if (dump_enabled_p ())
1641     dump_printf_loc (MSG_NOTE, vect_location,
1642                      "=== vect_analyze_loop_operations ===\n");
1643
1644   for (i = 0; i < nbbs; i++)
1645     {
1646       basic_block bb = bbs[i];
1647
1648       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1649            gsi_next (&si))
1650         {
1651           gphi *phi = si.phi ();
1652           ok = true;
1653
1654           stmt_info = vinfo_for_stmt (phi);
1655           if (dump_enabled_p ())
1656             {
1657               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1658               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1659             }
               /* PHIs of virtual operands are not checked here.  */
1660           if (virtual_operand_p (gimple_phi_result (phi)))
1661             continue;
1662
1663           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1664              (i.e., a phi in the tail of the outer-loop).  */
1665           if (! is_loop_header_bb_p (bb))
1666             {
1667               /* FORNOW: we currently don't support the case that these phis
1668                  are not used in the outerloop (unless it is double reduction,
1669                  i.e., this phi is vect_reduction_def), cause this case
1670                  requires to actually do something here.  */
1671               if (STMT_VINFO_LIVE_P (stmt_info)
1672                   && STMT_VINFO_DEF_TYPE (stmt_info)
1673                      != vect_double_reduction_def)
1674                 {
1675                   if (dump_enabled_p ())
1676                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677                                      "Unsupported loop-closed phi in "
1678                                      "outer-loop.\n");
1679                   return false;
1680                 }
1681
1682               /* If PHI is used in the outer loop, we check that its operand
1683                  is defined in the inner loop.  */
1684               if (STMT_VINFO_RELEVANT_P (stmt_info))
1685                 {
1686                   tree phi_op;
1687                   gimple *op_def_stmt;
1688
                       /* Only a PHI with exactly one argument, which must be
                          an SSA name, is accepted.  The rejections in this
                          block emit no dump message.  */
1689                   if (gimple_phi_num_args (phi) != 1)
1690                     return false;
1691
1692                   phi_op = PHI_ARG_DEF (phi, 0);
1693                   if (TREE_CODE (phi_op) != SSA_NAME)
1694                     return false;
1695
                       /* The argument must be defined by a real statement
                          inside LOOP that has a stmt_vec_info.  */
1696                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1697                   if (gimple_nop_p (op_def_stmt)
1698                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1699                       || !vinfo_for_stmt (op_def_stmt))
1700                     return false;
1701
                       /* That definition must be marked as used in the outer
                          loop, directly or by a reduction.  */
1702                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1703                         != vect_used_in_outer
1704                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1705                            != vect_used_in_outer_by_reduction)
1706                     return false;
1707                 }
1708
1709               continue;
1710             }
1711
               /* NOTE(review): STMT_INFO has already been used by the
                  STMT_VINFO_* tests in the non-header branch above, so this
                  assertion only covers PHIs in a loop header.  */
1712           gcc_assert (stmt_info);
1713
1714           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1715                || STMT_VINFO_LIVE_P (stmt_info))
1716               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1717             {
1718               /* A scalar-dependence cycle that we don't support.  */
1719               if (dump_enabled_p ())
1720                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1721                                  "not vectorized: scalar dependence cycle.\n");
1722               return false;
1723             }
1724
               /* A relevant induction, reduction or nested cycle that is not
                  pure SLP is checked by the matching vectorizable_* routine;
                  any other relevant PHI keeps OK set to true.  */
1725           if (STMT_VINFO_RELEVANT_P (stmt_info))
1726             {
1727               need_to_vectorize = true;
1728               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1729                   && ! PURE_SLP_STMT (stmt_info))
1730                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1731               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1732                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1733                        && ! PURE_SLP_STMT (stmt_info))
1734                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1735             }
1736
               /* A live PHI must additionally pass the live-operation check.  */
1737           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1738             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1739
1740           if (!ok)
1741             {
1742               if (dump_enabled_p ())
1743                 {
1744                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1745                                    "not vectorized: relevant phi not "
1746                                    "supported: ");
1747                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1748                 }
1749               return false;
1750             }
1751         }
1752
           /* Clobbers are skipped; every other statement of BB is handed to
              vect_analyze_stmt, which receives the address of
              NEED_TO_VECTORIZE.  */
1753       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1754            gsi_next (&si))
1755         {
1756           gimple *stmt = gsi_stmt (si);
1757           if (!gimple_clobber_p (stmt)
1758               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1759             return false;
1760         }
1761     } /* bbs */
1762
1763   /* All operations in the loop are either irrelevant (deal with loop
1764      control, or dead), or only used outside the loop and can be moved
1765      out of the loop (e.g. invariants, inductions).  The loop can be
1766      optimized away by scalar optimizations.  We're better off not
1767      touching this loop.  */
1768   if (!need_to_vectorize)
1769     {
1770       if (dump_enabled_p ())
1771         dump_printf_loc (MSG_NOTE, vect_location,
1772                          "All the computation can be taken out of the loop.\n");
1773       if (dump_enabled_p ())
1774         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1775                          "not vectorized: redundant loop. no profit to "
1776                          "vectorize.\n");
1777       return false;
1778     }
1779
1780   return true;
1781 }
1782
1783
1784 /* Function vect_analyze_loop_2.
1785
1786    Apply a set of analyses on the loop described by LOOP_VINFO.  The
1787    different analyses will record information in the loop_vec_info
1788    struct.

     Return true if the loop passed all the checks and false otherwise.
     FATAL is set to true on entry and cleared once
     vect_mark_stmts_to_be_vectorized has succeeded, so a false return
     before that point leaves FATAL true.

     If SLP was selected and a later check fails through the `again'
     label, the analysis is restarted at `start_over' with SLP disabled,
     provided there are no reduction chains and the grouped accesses of
     the SLP instances are supported by lane or interleaving operations.
     Since SLP is then off, at most one such restart happens.  */
1789 static bool
1790 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1791 {
1792   bool ok;
1793   int max_vf = MAX_VECTORIZATION_FACTOR;
1794   int min_vf = 2;
1795   unsigned int n_stmts = 0;
1796
1797   /* The first group of checks is independent of the vector size.  */
1798   fatal = true;
1799
1800   /* Find all data references in the loop (which correspond to vdefs/vuses)
1801      and analyze their evolution in the loop.  */
1802
1803   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1804
1805   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1806   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1807     {
1808       if (dump_enabled_p ())
1809         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1810                          "not vectorized: loop nest containing two "
1811                          "or more consecutive inner loops cannot be "
1812                          "vectorized\n");
1813       return false;
1814     }
1815
       /* Collect the data references of every non-debug statement and count
          those statements in N_STMTS, which is later passed to
          vect_analyze_slp.  */
1816   for (unsigned i = 0; i < loop->num_nodes; i++)
1817     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1818          !gsi_end_p (gsi); gsi_next (&gsi))
1819       {
1820         gimple *stmt = gsi_stmt (gsi);
1821         if (is_gimple_debug (stmt))
1822           continue;
1823         ++n_stmts;
1824         if (!find_data_references_in_stmt (loop, stmt,
1825                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1826           {
                 /* When the loop has a safelen, a call to a function that
                    has simd clones is tolerated if none of its arguments
                    and not its lhs is a decl or a reference with a base
                    address.  */
1827             if (is_gimple_call (stmt) && loop->safelen)
1828               {
1829                 tree fndecl = gimple_call_fndecl (stmt), op;
1830                 if (fndecl != NULL_TREE)
1831                   {
1832                     cgraph_node *node = cgraph_node::get (fndecl);
1833                     if (node != NULL && node->simd_clones != NULL)
1834                       {
1835                         unsigned int j, n = gimple_call_num_args (stmt);
1836                         for (j = 0; j < n; j++)
1837                           {
1838                             op = gimple_call_arg (stmt, j);
1839                             if (DECL_P (op)
1840                                 || (REFERENCE_CLASS_P (op)
1841                                     && get_base_address (op)))
1842                               break;
1843                           }
1844                         op = gimple_call_lhs (stmt);
1845                         /* Ignore #pragma omp declare simd functions
1846                            if they don't have data references in the
1847                            call stmt itself.  */
1848                         if (j == n
1849                             && !(op
1850                                  && (DECL_P (op)
1851                                      || (REFERENCE_CLASS_P (op)
1852                                          && get_base_address (op)))))
1853                           continue;
1854                       }
1855                   }
1856               }
1857             if (dump_enabled_p ())
1858               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1859                                "not vectorized: loop contains function "
1860                                "calls or data references that cannot "
1861                                "be analyzed\n");
1862             return false;
1863           }
1864       }
1865
1866   /* Analyze the data references and also adjust the minimal
1867      vectorization factor according to the loads and stores.  */
1868
1869   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1870   if (!ok)
1871     {
1872       if (dump_enabled_p ())
1873         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874                          "bad data references.\n");
1875       return false;
1876     }
1877
1878   /* Classify all cross-iteration scalar data-flow cycles.
1879      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1880   vect_analyze_scalar_cycles (loop_vinfo);
1881
1882   vect_pattern_recog (loop_vinfo);
1883
1884   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1885
1886   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1887      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1888
1889   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1890   if (!ok)
1891     {
1892       if (dump_enabled_p ())
1893         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1894                          "bad data access.\n");
1895       return false;
1896     }
1897
1898   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1899
1900   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1901   if (!ok)
1902     {
1903       if (dump_enabled_p ())
1904         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1905                          "unexpected pattern.\n");
1906       return false;
1907     }
1908
1909   /* The rest of the analysis below depends on the vector size in some
          way, so FATAL is cleared from here on.  */
1910   fatal = false;
1911
1912   /* Analyze data dependences between the data-refs in the loop
1913      and adjust the maximum vectorization factor according to
1914      the dependences.
1915      FORNOW: fail at the first data dependence that we encounter.  */
1916
1917   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1918   if (!ok
1919       || max_vf < min_vf)
1920     {
1921       if (dump_enabled_p ())
1922             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1923                              "bad data dependence.\n");
1924       return false;
1925     }
1926   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1927
1928   ok = vect_determine_vectorization_factor (loop_vinfo);
1929   if (!ok)
1930     {
1931       if (dump_enabled_p ())
1932         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1933                          "can't determine vectorization factor.\n");
1934       return false;
1935     }
       /* The chosen factor must not exceed what the dependences allow.  */
1936   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1937     {
1938       if (dump_enabled_p ())
1939         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1940                          "bad data dependence.\n");
1941       return false;
1942     }
1943
1944   /* Compute the scalar iteration cost.  */
1945   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1946
       /* Remember the factor as it is before any SLP adjustment so that the
          retry at `again' can restore it.  */
1947   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1948   HOST_WIDE_INT estimated_niter;
1949   unsigned th;
1950   int min_scalar_loop_bound;
1951
1952   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1953   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1954   if (!ok)
1955     return false;
1956
1957   /* If there are any SLP instances mark them as pure_slp.  */
1958   bool slp = vect_make_slp_decision (loop_vinfo);
1959   if (slp)
1960     {
1961       /* Find stmts that need to be both vectorized and SLPed.  */
1962       vect_detect_hybrid_slp (loop_vinfo);
1963
1964       /* Update the vectorization factor based on the SLP decision.  */
1965       vect_update_vf_for_slp (loop_vinfo);
1966     }
1967
1968   /* This is the point where we can re-start analysis with SLP forced off.  */
1969 start_over:
1970
1971   /* Now the vectorization factor is final.  */
1972   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1973   gcc_assert (vectorization_factor != 0);
1974
1975   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1976     dump_printf_loc (MSG_NOTE, vect_location,
1977                      "vectorization_factor = %d, niters = "
1978                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1979                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1980
       /* Give up if the known iteration count, or the likely maximum when one
          is available (not -1), is below the vectorization factor.  */
1981   HOST_WIDE_INT max_niter
1982     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1983   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1984        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1985       || (max_niter != -1
1986           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1987     {
1988       if (dump_enabled_p ())
1989         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1990                          "not vectorized: iteration count smaller than "
1991                          "vectorization factor.\n");
1992       return false;
1993     }
1994
1995   /* Analyze the alignment of the data-refs in the loop.
1996      Fail if a data reference is found that cannot be vectorized.  */
1997
1998   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1999   if (!ok)
2000     {
2001       if (dump_enabled_p ())
2002         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003                          "bad data alignment.\n");
2004       return false;
2005     }
2006
2007   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2008      It is important to call pruning after vect_analyze_data_ref_accesses,
2009      since we use grouping information gathered by interleaving analysis.  */
2010   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2011   if (!ok)
2012     return false;
2013
2014   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2015      vectorization.  */
2016   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2017     {
2018     /* This pass will decide on using loop versioning and/or loop peeling in
2019        order to enhance the alignment of data references in the loop.  */
2020     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2021     if (!ok)
2022       {
2023         if (dump_enabled_p ())
2024           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2025                            "bad data alignment.\n");
2026         return false;
2027       }
2028     }
2029
2030   if (slp)
2031     {
2032       /* Analyze operations in the SLP instances.  Note this may
2033          remove unsupported SLP instances which makes the above
2034          SLP kind detection invalid.  */
2035       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2036       vect_slp_analyze_operations (loop_vinfo);
2037       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2038         goto again;
2039     }
2040
2041   /* Scan all the remaining operations in the loop that are not subject
2042      to SLP and make sure they are vectorizable.  */
2043   ok = vect_analyze_loop_operations (loop_vinfo);
2044   if (!ok)
2045     {
2046       if (dump_enabled_p ())
2047         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2048                          "bad operation or unsupported loop bound.\n");
2049       return false;
2050     }
2051
2052   /* If epilog loop is required because of data accesses with gaps,
2053      one additional iteration needs to be peeled.  Check if there is
2054      enough iterations for vectorization.  */
2055   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2056       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2057     {
2058       int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2059       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2060
           /* Note the comparison uses NITERSM1, not NITERS.  */
2061       if (wi::to_widest (scalar_niters) < vf)
2062         {
2063           if (dump_enabled_p ())
2064             dump_printf_loc (MSG_NOTE, vect_location,
2065                              "loop has no enough iterations to support"
2066                              " peeling for gaps.\n");
2067           return false;
2068         }
2069     }
2070
2071   /* Analyze cost.  Decide if worth while to vectorize.  */
2072   int min_profitable_estimate, min_profitable_iters;
2073   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2074                                       &min_profitable_estimate);
2075
       /* A negative result is treated as "never profitable"; retry without
          SLP if possible.  */
2076   if (min_profitable_iters < 0)
2077     {
2078       if (dump_enabled_p ())
2079         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2080                          "not vectorized: vectorization not profitable.\n");
2081       if (dump_enabled_p ())
2082         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2083                          "not vectorized: vector version will never be "
2084                          "profitable.\n");
2085       goto again;
2086     }
2087
2088   min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2089                            * vectorization_factor);
2090
2091   /* Use the cost model only if it is more conservative than user specified
2092      threshold.  */
2093   th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2094
2095   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2096
2097   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2098       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2099     {
2100       if (dump_enabled_p ())
2101         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2102                          "not vectorized: vectorization not profitable.\n");
2103       if (dump_enabled_p ())
2104         dump_printf_loc (MSG_NOTE, vect_location,
2105                          "not vectorized: iteration count smaller than user "
2106                          "specified loop bound parameter or minimum profitable "
2107                          "iterations (whichever is more conservative).\n");
2108       goto again;
2109     }
2110
       /* Use the estimated statement execution count, falling back to
          MAX_NITER when no estimate is available (-1).  */
2111   estimated_niter
2112     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2113   if (estimated_niter == -1)
2114     estimated_niter = max_niter;
2115   if (estimated_niter != -1
2116       && ((unsigned HOST_WIDE_INT) estimated_niter
2117           < MAX (th, (unsigned) min_profitable_estimate)))
2118     {
2119       if (dump_enabled_p ())
2120         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2121                          "not vectorized: estimated iteration count too "
2122                          "small.\n");
2123       if (dump_enabled_p ())
2124         dump_printf_loc (MSG_NOTE, vect_location,
2125                          "not vectorized: estimated iteration count smaller "
2126                          "than specified loop bound parameter or minimum "
2127                          "profitable iterations (whichever is more "
2128                          "conservative).\n");
2129       goto again;
2130     }
2131
2132   /* Decide whether we need to create an epilogue loop to handle
2133      remaining scalar iterations.  */
       /* TH is the cost model threshold rounded down to a multiple of the
          vectorization factor.  */
2134   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
2135          / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2136         * LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2137
       /* With a known iteration count and a positive alignment peeling
          amount, compare the trailing zero bits of what remains after that
          peeling with log2 of the vectorization factor (this presumably
          assumes the factor is a power of two -- confirm).  */
2138   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2139       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2140     {
2141       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2142                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2143           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2144         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2145     }
2146   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2147            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2148                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2149                /* In case of versioning, check if the maximum number of
2150                   iterations is greater than th.  If they are identical,
2151                   the epilogue is unnecessary.  */
2152                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2153                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2154     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2155
2156   /* If an epilogue loop is required make sure we can create one.  */
2157   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2158       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2159     {
2160       if (dump_enabled_p ())
2161         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2162       if (!vect_can_advance_ivs_p (loop_vinfo)
2163           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2164                                            single_exit (LOOP_VINFO_LOOP
2165                                                          (loop_vinfo))))
2166         {
2167           if (dump_enabled_p ())
2168             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2169                              "not vectorized: can't create required "
2170                              "epilog loop\n");
2171           goto again;
2172         }
2173     }
2174
2175   /* During peeling, we need to check if number of loop iterations is
2176      enough for both peeled prolog loop and vector loop.  This check
2177      can be merged along with threshold check of loop versioning, so
2178      increase threshold for this case if necessary.  */
2179   if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
2180       && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2181           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2182     {
2183       unsigned niters_th;
2184
2185       /* Niters for peeled prolog loop.  */
           /* NOTE(review): a negative peeling amount presumably means the
              amount is not known at compile time -- confirm.  In that case
              use one less than the number of elements of the vector type of
              the unaligned data reference.  */
2186       if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2187         {
2188           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2189           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2190
2191           niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2192         }
2193       else
2194         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2195
2196       /* Niters for at least one iteration of vectorized loop.  */
2197       niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2198       /* One additional iteration because of peeling for gap.  */
2199       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2200         niters_th++;
2201       if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th)
2202         LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th;
2203     }
2204
       /* The factor must not have changed since `start_over'.  */
2205   gcc_assert (vectorization_factor
2206               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2207
2208   /* Ok to vectorize!  */
2209   return true;
2210
2211 again:
2212   /* Try again with SLP forced off but if we didn't do any SLP there is
2213      no point in re-trying.  */
2214   if (!slp)
2215     return false;
2216
2217   /* If there are reduction chains re-trying will fail anyway.  */
2218   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2219     return false;
2220
2221   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2222      via interleaving or lane instructions.  */
2223   slp_instance instance;
2224   slp_tree node;
2225   unsigned i, j;
2226   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2227     {
2228       stmt_vec_info vinfo;
2229       vinfo = vinfo_for_stmt
2230           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
           /* Instances whose root statement is not a grouped access are not
              checked.  */
2231       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2232         continue;
2233       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2234       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2235       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2236       if (! vect_store_lanes_supported (vectype, size)
2237           && ! vect_grouped_store_supported (vectype, size))
2238         return false;
2239       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2240         {
2241           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2242           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2243           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2244           size = STMT_VINFO_GROUP_SIZE (vinfo);
2245           vectype = STMT_VINFO_VECTYPE (vinfo);
2246           if (! vect_load_lanes_supported (vectype, size)
2247               && ! vect_grouped_load_supported (vectype, single_element_p,
2248                                                 size))
2249             return false;
2250         }
2251     }
2252
2253   if (dump_enabled_p ())
2254     dump_printf_loc (MSG_NOTE, vect_location,
2255                      "re-trying with SLP disabled\n");
2256
2257   /* Roll back state appropriately.  No SLP this time.  */
2258   slp = false;
2259   /* Restore vectorization factor as it were without SLP.  */
2260   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2261   /* Free the SLP instances.  */
2262   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2263     vect_free_slp_instance (instance);
2264   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2265   /* Reset SLP type to loop_vect on all stmts.  */
2266   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2267     {
2268       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2269       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2270            !gsi_end_p (si); gsi_next (&si))
2271         {
2272           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2273           STMT_SLP_TYPE (stmt_info) = loop_vect;
2274         }
2275       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2276            !gsi_end_p (si); gsi_next (&si))
2277         {
2278           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2279           STMT_SLP_TYPE (stmt_info) = loop_vect;
               /* Also reset the related pattern statement and every
                  statement of its pattern definition sequence.  */
2280           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2281             {
2282               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2283               STMT_SLP_TYPE (stmt_info) = loop_vect;
2284               for (gimple_stmt_iterator pi
2285                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2286                    !gsi_end_p (pi); gsi_next (&pi))
2287                 {
2288                   gimple *pstmt = gsi_stmt (pi);
2289                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2290                 }
2291             }
2292         }
2293     }
2294   /* Free optimized alias test DDRS.  */
2295   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2296   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2297   /* Reset target cost data.  */
2298   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2299   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2300     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2301   /* Reset assorted flags.  */
2302   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2303   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2304   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2305
2306   goto start_over;
2307 }
2308
2309 /* Function vect_analyze_loop.
2310
2311    Apply the vectorizer's analyses to LOOP and return the loop_vec_info
2312    in which they recorded their results, or NULL on failure.  A non-NULL
2313    ORIG_LOOP_VINFO (the epilogue is being vectorized) is recorded in the
2314    new loop_vec_info before the analysis runs.  */
2315 loop_vec_info
2316 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2317 {
2318   /* Let the first attempt autodetect its vector size.  REMAINING_SIZES
2319      is the mask of vector sizes that have not been tried yet.  */
2320   current_vector_size = 0;
2321   unsigned int remaining_sizes
2322     = targetm.vectorize.autovectorize_vector_sizes ();
2323
2324   if (dump_enabled_p ())
2325     dump_printf_loc (MSG_NOTE, vect_location,
2326                      "===== analyze_loop_nest =====\n");
2327
2328   /* Give up when the enclosing loop is already marked vectorizable.  */
2329   struct loop *outer = loop_outer (loop);
2330   loop_vec_info outer_vinfo = outer ? loop_vec_info_for_loop (outer) : NULL;
2331   if (outer_vinfo && LOOP_VINFO_VECTORIZABLE_P (outer_vinfo))
2332     {
2333       if (dump_enabled_p ())
2334         dump_printf_loc (MSG_NOTE, vect_location,
2335                          "outer-loop already vectorized.\n");
2336       return NULL;
2337     }
2338
2339   for (;;)
2340     {
2341       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2342       loop_vec_info vinfo = vect_analyze_loop_form (loop);
2343       if (vinfo == NULL)
2344         {
2345           if (dump_enabled_p ())
2346             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2347                              "bad loop form.\n");
2348           return NULL;
2349         }
2350
2351       if (orig_loop_vinfo)
2352         LOOP_VINFO_ORIG_LOOP_INFO (vinfo) = orig_loop_vinfo;
2353
2354       bool fatal = false;
2355       if (vect_analyze_loop_2 (vinfo, fatal))
2356         {
2357           LOOP_VINFO_VECTORIZABLE_P (vinfo) = 1;
2358           return vinfo;
2359         }
2360       delete vinfo;
2361
2362       /* Stop on a fatal failure, when no untried size remains, or when
2363          current_vector_size is still zero.  */
2364       remaining_sizes &= ~current_vector_size;
2365       const bool give_up = (fatal
2366                             || remaining_sizes == 0
2367                             || current_vector_size == 0);
2368       if (give_up)
2369         return NULL;
2370
2371       /* Try the next biggest vector size.  */
2372       current_vector_size = 1 << floor_log2 (remaining_sizes);
2373       if (dump_enabled_p ())
2374         dump_printf_loc (MSG_NOTE, vect_location,
2375                          "***** Re-trying analysis with vector size %d\n",
2376                          current_vector_size);
2377     }
2378 }
2379
2380
2381 /* Function reduction_fn_for_scalar_code
2382
2383    Input:
2384    CODE - tree_code of a reduction operation.
2385
2386    Output:
2387    REDUC_FN - the internal function that reduces the vector of partial
2388       results to a single scalar result, or IFN_LAST if CODE is a
2389       supported reduction operation that has no such internal function.
2390       Left untouched when FALSE is returned.
2391
2392    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2393
2394 static bool
2395 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2396 {
2397   internal_fn fn;
2398   switch (code)
2399     {
2400     case MAX_EXPR:
2401       fn = IFN_REDUC_MAX;
2402       break;
2403     case MIN_EXPR:
2404       fn = IFN_REDUC_MIN;
2405       break;
2406     case PLUS_EXPR:
2407       fn = IFN_REDUC_PLUS;
2408       break;
2409     case MULT_EXPR:
2410     case MINUS_EXPR:
2411     case BIT_IOR_EXPR:
2412     case BIT_XOR_EXPR:
2413     case BIT_AND_EXPR:
2414       /* Supported, but there is no internal function for the final step.  */
2415       fn = IFN_LAST;
2416       break;
2417     default:
2418       return false;
2419     }
2420   *reduc_fn = fn;
2421   return true;
2422 }
2423
2424
2425 /* Error reporting helper for vect_is_simple_reduction below.  Print MSG at
2426    vect_location, then dump GIMPLE statement STMT; both use kind MSG_TYPE.  */
2427
2428 static void
2429 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2430 {
2431   dump_printf_loc (msg_type, vect_location, "%s", msg);
2432   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2433 }
2434
2435
2436 /* Detect SLP reduction of the form:
2437
2438    #a1 = phi <a5, a0>
2439    a2 = operation (a1)
2440    a3 = operation (a2)
2441    a4 = operation (a3)
2442    a5 = operation (a4)
2443
2444    #a = phi <a5>
2445
2446    PHI is the reduction phi node (#a1 = phi <a5, a0> above) and FIRST_STMT
2447    is the first reduction stmt in the chain (a2 = operation (a1)); every
2448    stmt of the chain must use FIRST_STMT's operation code.  On success the
2449    chain is linked as a group and recorded in LOOP_INFO's reduction chains.
2450    Return TRUE if a reduction chain of at least two stmts was detected.  */
2451
2452 static bool
2453 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2454                        gimple *first_stmt)
2455 {
2456   struct loop *loop = (gimple_bb (phi))->loop_father;
2457   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2458   enum tree_code code;
2459   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2460   stmt_vec_info use_stmt_info, current_stmt_info;
2461   tree lhs;
2462   imm_use_iterator imm_iter;
2463   use_operand_p use_p;
2464   int nloop_uses, size = 0, n_out_of_loop_uses;
2465   bool found = false;
2466   /* Only a PHI in the loop being vectorized itself is handled here.  */
2467   if (loop != vect_loop)
2468     return false;
2469   /* Follow the in-loop uses from PHI's result until PHI is reached again.  */
2470   lhs = PHI_RESULT (phi);
2471   code = gimple_assign_rhs_code (first_stmt);
2472   while (1)
2473     {
2474       nloop_uses = 0;
2475       n_out_of_loop_uses = 0;
2476       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2477         {
2478           gimple *use_stmt = USE_STMT (use_p);
2479           if (is_gimple_debug (use_stmt))
2480             continue;
2481
2482           /* Check if we got back to the reduction phi.  */
2483           if (use_stmt == phi)
2484             {
2485               loop_use_stmt = use_stmt;
2486               found = true;
2487               break;
2488             }
2489
2490           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2491             {
2492               loop_use_stmt = use_stmt;
2493               nloop_uses++;
2494             }
2495            else
2496              n_out_of_loop_uses++;
2497
2498            /* There can be either a single use in the loop or two uses in
2499               phi nodes.  */
2500            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2501              return false;
2502         }
2503
2504       if (found)
2505         break;
2506
2507       /* We reached a statement with no loop uses.  */
2508       if (nloop_uses == 0)
2509         return false;
2510
2511       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2512       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2513         return false;
2514       /* The single in-loop use must be an assignment with the chain's code.  */
2515       if (!is_gimple_assign (loop_use_stmt)
2516           || code != gimple_assign_rhs_code (loop_use_stmt)
2517           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2518         return false;
2519       /* NOTE(review): links set below are not undone if we fail later.  */
2520       /* Insert LOOP_USE_STMT into the reduction chain.  */
2521       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2522       if (current_stmt)
2523         {
2524           current_stmt_info = vinfo_for_stmt (current_stmt);
2525           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2526           GROUP_FIRST_ELEMENT (use_stmt_info)
2527             = GROUP_FIRST_ELEMENT (current_stmt_info);
2528         }
2529       else
2530         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2531
2532       lhs = gimple_assign_lhs (loop_use_stmt);
2533       current_stmt = loop_use_stmt;
2534       size++;
2535    }
2536   /* Reject walks that did not end at PHI and chains shorter than two.  */
2537   if (!found || loop_use_stmt != phi || size < 2)
2538     return false;
2539
2540   /* Swap the operands, if needed, to make the reduction operand be the second
2541      operand.  */
2542   lhs = PHI_RESULT (phi);
2543   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2544   while (next_stmt)
2545     {
2546       if (gimple_assign_rhs2 (next_stmt) == lhs)
2547         {
2548           tree op = gimple_assign_rhs1 (next_stmt);
2549           gimple *def_stmt = NULL;
2550
2551           if (TREE_CODE (op) == SSA_NAME)
2552             def_stmt = SSA_NAME_DEF_STMT (op);
2553
2554           /* Check that the other def is either defined in the loop
2555              ("vect_internal_def"), or it's an induction (defined by a
2556              loop-header phi-node).  */
2557           if (def_stmt
2558               && gimple_bb (def_stmt)
2559               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2560               && (is_gimple_assign (def_stmt)
2561                   || is_gimple_call (def_stmt)
2562                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2563                            == vect_induction_def
2564                   || (gimple_code (def_stmt) == GIMPLE_PHI
2565                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2566                                   == vect_internal_def
2567                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2568             {
2569               lhs = gimple_assign_lhs (next_stmt);
2570               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2571               continue;
2572             }
2573
2574           return false;
2575         }
2576       else
2577         {
2578           tree op = gimple_assign_rhs2 (next_stmt);
2579           gimple *def_stmt = NULL;
2580
2581           if (TREE_CODE (op) == SSA_NAME)
2582             def_stmt = SSA_NAME_DEF_STMT (op);
2583
2584           /* Check that the other def is either defined in the loop
2585             ("vect_internal_def"), or it's an induction (defined by a
2586             loop-header phi-node).  */
2587           if (def_stmt
2588               && gimple_bb (def_stmt)
2589               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2590               && (is_gimple_assign (def_stmt)
2591                   || is_gimple_call (def_stmt)
2592                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2593                               == vect_induction_def
2594                   || (gimple_code (def_stmt) == GIMPLE_PHI
2595                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2596                                   == vect_internal_def
2597                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2598             {
2599               if (dump_enabled_p ())
2600                 {
2601                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2602                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2603                 }
2604
2605               swap_ssa_operands (next_stmt,
2606                                  gimple_assign_rhs1_ptr (next_stmt),
2607                                  gimple_assign_rhs2_ptr (next_stmt));
2608               update_stmt (next_stmt);
2609               /* Record that a constant ended up as the first operand.  */
2610               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2611                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2612             }
2613           else
2614             return false;
2615         }
2616
2617       lhs = gimple_assign_lhs (next_stmt);
2618       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2619     }
2620
2621   /* Save the chain for further analysis in SLP detection.  */
2622   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2623   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2624   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2625
2626   return true;
2627 }
2628
2629
2630 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2631    reduction operation CODE has a handled computation expression.  */
2632 /* LOC is only used as the location of the "reduction path: " dump.  */
2633 bool
2634 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2635                       enum tree_code code)
2636 {
2637   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2638   auto_bitmap visited;  /* SSA versions of operands already taken.  */
2639   tree lookfor = PHI_RESULT (phi);
2640   ssa_op_iter curri;
2641   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2642   while (USE_FROM_PTR (curr) != loop_arg)
2643     curr = op_iter_next_use (&curri);  /* Seek PHI's use of LOOP_ARG.  */
2644   curri.i = curri.numops;  /* Presumably ends iteration over PHI's args.  */
2645   do
2646     {
2647       path.safe_push (std::make_pair (curri, curr));
2648       tree use = USE_FROM_PTR (curr);
2649       if (use == lookfor)  /* Reached PHI's result: a path was found.  */
2650         break;
2651       gimple *def = SSA_NAME_DEF_STMT (use);
2652       if (gimple_nop_p (def)
2653           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2654         {
2655 pop:  /* Backtrack to the next unvisited SSA operand on the path.  */
2656           do
2657             {
2658               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2659               curri = x.first;
2660               curr = x.second;
2661               do
2662                 curr = op_iter_next_use (&curri);
2663               /* Skip already visited or non-SSA operands (from iterating
2664                  over PHI args).  */
2665               while (curr != NULL_USE_OPERAND_P
2666                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2667                          || ! bitmap_set_bit (visited,
2668                                               SSA_NAME_VERSION
2669                                                 (USE_FROM_PTR (curr)))));
2670             }
2671           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2672           if (curr == NULL_USE_OPERAND_P)  /* Search exhausted.  */
2673             break;
2674         }
2675       else
2676         {
2677           if (gimple_code (def) == GIMPLE_PHI)
2678             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2679           else
2680             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2681           while (curr != NULL_USE_OPERAND_P
2682                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2683                      || ! bitmap_set_bit (visited,
2684                                           SSA_NAME_VERSION
2685                                             (USE_FROM_PTR (curr)))))
2686             curr = op_iter_next_use (&curri);
2687           if (curr == NULL_USE_OPERAND_P)
2688             goto pop;
2689         }
2690     }
2691   while (1);
2692   if (dump_file && (dump_flags & TDF_DETAILS))
2693     {
2694       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2695       unsigned i;
2696       std::pair<ssa_op_iter, use_operand_p> *x;
2697       FOR_EACH_VEC_ELT (path, i, x)
2698         {
2699           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2700           dump_printf (MSG_NOTE, " ");
2701         }
2702       dump_printf (MSG_NOTE, "\n");
2703     }
2704   /* An empty path means the search above found no way back to PHI.  */
2705   /* Check whether the reduction path detected is valid.  */
2706   bool fail = path.length () == 0;
2707   bool neg = false;
2708   for (unsigned i = 1; i < path.length (); ++i)  /* Entry 0 is PHI's use.  */
2709     {
2710       gimple *use_stmt = USE_STMT (path[i].second);
2711       tree op = USE_FROM_PTR (path[i].second);
2712       if (! has_single_use (op)
2713           || ! is_gimple_assign (use_stmt))
2714         {
2715           fail = true;
2716           break;
2717         }
2718       if (gimple_assign_rhs_code (use_stmt) != code)
2719         {
2720           if (code == PLUS_EXPR
2721               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2722             {
2723               /* Track whether we negate the reduction value each iteration.  */
2724               if (gimple_assign_rhs2 (use_stmt) == op)
2725                 neg = ! neg;
2726             }
2727           else
2728             {
2729               fail = true;
2730               break;
2731             }
2732         }
2733     }
2734   return ! fail && ! neg;  /* A net negation of the value is not handled.  */
2735 }
2736
2737
2738 /* Function vect_is_simple_reduction
2739
2740    (1) Detect a cross-iteration def-use cycle that represents a simple
2741    reduction computation.  We look for the following pattern:
2742
2743    loop_header:
2744      a1 = phi < a0, a2 >
2745      a3 = ...
2746      a2 = operation (a3, a1)
2747
2748    or
2749
2750    a3 = ...
2751    loop_header:
2752      a1 = phi < a0, a2 >
2753      a2 = operation (a3, a1)
2754
2755    such that:
2756    1. operation is commutative and associative and it is safe to
2757       change the order of the computation
2758    2. no uses for a2 in the loop (a2 is used out of the loop)
2759    3. no uses of a1 in the loop besides the reduction operation
2760    4. no uses of a1 outside the loop.
2761
2762    Conditions 1,4 are tested here.
2763    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2764
2765    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2766    nested cycles.
2767
2768    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2769    reductions:
2770
2771      a1 = phi < a0, a2 >
2772      inner loop (def of a3)
2773      a2 = phi < a3 >
2774
2775    (4) Detect condition expressions, ie:
2776      for (int i = 0; i < N; i++)
2777        if (a[i] < val)
2778         ret_val = a[i];
2779
2780 */
2781
2782 static gimple *
2783 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2784                           bool *double_reduc,
2785                           bool need_wrapping_integral_overflow,
2786                           enum vect_reduction_type *v_reduc_type)
2787 {
2788   struct loop *loop = (gimple_bb (phi))->loop_father;
2789   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2790   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2791   enum tree_code orig_code, code;
2792   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2793   tree type;
2794   int nloop_uses;
2795   tree name;
2796   imm_use_iterator imm_iter;
2797   use_operand_p use_p;
2798   bool phi_def;
2799
2800   *double_reduc = false;
2801   *v_reduc_type = TREE_CODE_REDUCTION;
2802
2803   tree phi_name = PHI_RESULT (phi);
2804   /* ???  If there are no uses of the PHI result the inner loop reduction
2805      won't be detected as possibly double-reduction by vectorizable_reduction
2806      because that tries to walk the PHI arg from the preheader edge which
2807      can be constant.  See PR60382.  */
2808   if (has_zero_uses (phi_name))
2809     return NULL;
2810   nloop_uses = 0;
2811   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2812     {
2813       gimple *use_stmt = USE_STMT (use_p);
2814       if (is_gimple_debug (use_stmt))
2815         continue;
2816
2817       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2818         {
2819           if (dump_enabled_p ())
2820             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2821                              "intermediate value used outside loop.\n");
2822
2823           return NULL;
2824         }
2825
2826       nloop_uses++;
2827       if (nloop_uses > 1)
2828         {
2829           if (dump_enabled_p ())
2830             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2831                              "reduction value used in loop.\n");
2832           return NULL;
2833         }
2834
2835       phi_use_stmt = use_stmt;
2836     }
2837
2838   edge latch_e = loop_latch_edge (loop);
2839   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2840   if (TREE_CODE (loop_arg) != SSA_NAME)
2841     {
2842       if (dump_enabled_p ())
2843         {
2844           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2845                            "reduction: not ssa_name: ");
2846           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2847           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2848         }
2849       return NULL;
2850     }
2851
2852   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2853   if (is_gimple_assign (def_stmt))
2854     {
2855       name = gimple_assign_lhs (def_stmt);
2856       phi_def = false;
2857     }
2858   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2859     {
2860       name = PHI_RESULT (def_stmt);
2861       phi_def = true;
2862     }
2863   else
2864     {
2865       if (dump_enabled_p ())
2866         {
2867           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2868                            "reduction: unhandled reduction operation: ");
2869           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2870         }
2871       return NULL;
2872     }
2873
2874   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2875     return NULL;
2876
2877   nloop_uses = 0;
2878   auto_vec<gphi *, 3> lcphis;
2879   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2880     {
2881       gimple *use_stmt = USE_STMT (use_p);
2882       if (is_gimple_debug (use_stmt))
2883         continue;
2884       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2885         nloop_uses++;
2886       else
2887         /* We can have more than one loop-closed PHI.  */
2888         lcphis.safe_push (as_a <gphi *> (use_stmt));
2889       if (nloop_uses > 1)
2890         {
2891           if (dump_enabled_p ())
2892             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2893                              "reduction used in loop.\n");
2894           return NULL;
2895         }
2896     }
2897
2898   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2899      defined in the inner loop.  */
2900   if (phi_def)
2901     {
2902       op1 = PHI_ARG_DEF (def_stmt, 0);
2903
2904       if (gimple_phi_num_args (def_stmt) != 1
2905           || TREE_CODE (op1) != SSA_NAME)
2906         {
2907           if (dump_enabled_p ())
2908             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2909                              "unsupported phi node definition.\n");
2910
2911           return NULL;
2912         }
2913
2914       def1 = SSA_NAME_DEF_STMT (op1);
2915       if (gimple_bb (def1)
2916           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2917           && loop->inner
2918           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2919           && is_gimple_assign (def1)
2920           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2921         {
2922           if (dump_enabled_p ())
2923             report_vect_op (MSG_NOTE, def_stmt,
2924                             "detected double reduction: ");
2925
2926           *double_reduc = true;
2927           return def_stmt;
2928         }
2929
2930       return NULL;
2931     }
2932
2933   /* If we are vectorizing an inner reduction we are executing that
2934      in the original order only in case we are not dealing with a
2935      double reduction.  */
2936   bool check_reduction = true;
2937   if (flow_loop_nested_p (vect_loop, loop))
2938     {
2939       gphi *lcphi;
2940       unsigned i;
2941       check_reduction = false;
2942       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2943         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2944           {
2945             gimple *use_stmt = USE_STMT (use_p);
2946             if (is_gimple_debug (use_stmt))
2947               continue;
2948             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2949               check_reduction = true;
2950           }
2951     }
2952
2953   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2954   code = orig_code = gimple_assign_rhs_code (def_stmt);
2955
2956   /* We can handle "res -= x[i]", which is non-associative by
2957      simply rewriting this into "res += -x[i]".  Avoid changing
2958      gimple instruction for the first simple tests and only do this
2959      if we're allowed to change code at all.  */
2960   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2961     code = PLUS_EXPR;
2962
2963   if (code == COND_EXPR)
2964     {
2965       if (! nested_in_vect_loop)
2966         *v_reduc_type = COND_REDUCTION;
2967
2968       op3 = gimple_assign_rhs1 (def_stmt);
2969       if (COMPARISON_CLASS_P (op3))
2970         {
2971           op4 = TREE_OPERAND (op3, 1);
2972           op3 = TREE_OPERAND (op3, 0);
2973         }
2974       if (op3 == phi_name || op4 == phi_name)
2975         {
2976           if (dump_enabled_p ())
2977             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2978                             "reduction: condition depends on previous"
2979                             " iteration: ");
2980           return NULL;
2981         }
2982
2983       op1 = gimple_assign_rhs2 (def_stmt);
2984       op2 = gimple_assign_rhs3 (def_stmt);
2985     }
2986   else if (!commutative_tree_code (code) || !associative_tree_code (code))
2987     {
2988       if (dump_enabled_p ())
2989         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2990                         "reduction: not commutative/associative: ");
2991       return NULL;
2992     }
2993   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2994     {
2995       op1 = gimple_assign_rhs1 (def_stmt);
2996       op2 = gimple_assign_rhs2 (def_stmt);
2997     }
2998   else
2999     {
3000       if (dump_enabled_p ())
3001         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3002                         "reduction: not handled operation: ");
3003       return NULL;
3004     }
3005
3006   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3007     {
3008       if (dump_enabled_p ())
3009         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3010                         "reduction: both uses not ssa_names: ");
3011
3012       return NULL;
3013     }
3014
3015   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3016   if ((TREE_CODE (op1) == SSA_NAME
3017        && !types_compatible_p (type,TREE_TYPE (op1)))
3018       || (TREE_CODE (op2) == SSA_NAME
3019           && !types_compatible_p (type, TREE_TYPE (op2)))
3020       || (op3 && TREE_CODE (op3) == SSA_NAME
3021           && !types_compatible_p (type, TREE_TYPE (op3)))
3022       || (op4 && TREE_CODE (op4) == SSA_NAME
3023           && !types_compatible_p (type, TREE_TYPE (op4))))
3024     {
3025       if (dump_enabled_p ())
3026         {
3027           dump_printf_loc (MSG_NOTE, vect_location,
3028                            "reduction: multiple types: operation type: ");
3029           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3030           dump_printf (MSG_NOTE, ", operands types: ");
3031           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3032                              TREE_TYPE (op1));
3033           dump_printf (MSG_NOTE, ",");
3034           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3035                              TREE_TYPE (op2));
3036           if (op3)
3037             {
3038               dump_printf (MSG_NOTE, ",");
3039               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3040                                  TREE_TYPE (op3));
3041             }
3042
3043           if (op4)
3044             {
3045               dump_printf (MSG_NOTE, ",");
3046               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3047                                  TREE_TYPE (op4));
3048             }
3049           dump_printf (MSG_NOTE, "\n");
3050         }
3051
3052       return NULL;
3053     }
3054
3055   /* Check that it's ok to change the order of the computation.
3056      Generally, when vectorizing a reduction we change the order of the
3057      computation.  This may change the behavior of the program in some
3058      cases, so we need to check that this is ok.  One exception is when
3059      vectorizing an outer-loop: the inner-loop is executed sequentially,
3060      and therefore vectorizing reductions in the inner-loop during
3061      outer-loop vectorization is safe.  */
3062
3063   if (*v_reduc_type != COND_REDUCTION
3064       && check_reduction)
3065     {
3066       /* CHECKME: check for !flag_finite_math_only too?  */
3067       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
3068         {
3069           /* Changing the order of operations changes the semantics.  */
3070           if (dump_enabled_p ())
3071             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3072                         "reduction: unsafe fp math optimization: ");
3073           return NULL;
3074         }
3075       else if (INTEGRAL_TYPE_P (type))
3076         {
3077           if (!operation_no_trapping_overflow (type, code))
3078             {
3079               /* Changing the order of operations changes the semantics.  */
3080               if (dump_enabled_p ())
3081                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3082                                 "reduction: unsafe int math optimization"
3083                                 " (overflow traps): ");
3084               return NULL;
3085             }
3086           if (need_wrapping_integral_overflow
3087               && !TYPE_OVERFLOW_WRAPS (type)
3088               && operation_can_overflow (code))
3089             {
3090               /* Changing the order of operations changes the semantics.  */
3091               if (dump_enabled_p ())
3092                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3093                                 "reduction: unsafe int math optimization"
3094                                 " (overflow doesn't wrap): ");
3095               return NULL;
3096             }
3097         }
3098       else if (SAT_FIXED_POINT_TYPE_P (type))
3099         {
3100           /* Changing the order of operations changes the semantics.  */
3101           if (dump_enabled_p ())
3102           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3103                           "reduction: unsafe fixed-point math optimization: ");
3104           return NULL;
3105         }
3106     }
3107
3108   /* Reduction is safe. We're dealing with one of the following:
3109      1) integer arithmetic and no trapv
3110      2) floating point arithmetic, and special flags permit this optimization
3111      3) nested cycle (i.e., outer loop vectorization).  */
3112   if (TREE_CODE (op1) == SSA_NAME)
3113     def1 = SSA_NAME_DEF_STMT (op1);
3114
3115   if (TREE_CODE (op2) == SSA_NAME)
3116     def2 = SSA_NAME_DEF_STMT (op2);
3117
3118   if (code != COND_EXPR
3119       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3120     {
3121       if (dump_enabled_p ())
3122         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3123       return NULL;
3124     }
3125
3126   /* Check that one def is the reduction def, defined by PHI,
3127      the other def is either defined in the loop ("vect_internal_def"),
3128      or it's an induction (defined by a loop-header phi-node).  */
3129
3130   if (def2 && def2 == phi
3131       && (code == COND_EXPR
3132           || !def1 || gimple_nop_p (def1)
3133           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3134           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3135               && (is_gimple_assign (def1)
3136                   || is_gimple_call (def1)
3137                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3138                       == vect_induction_def
3139                   || (gimple_code (def1) == GIMPLE_PHI
3140                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3141                           == vect_internal_def
3142                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3143     {
3144       if (dump_enabled_p ())
3145         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3146       return def_stmt;
3147     }
3148
3149   if (def1 && def1 == phi
3150       && (code == COND_EXPR
3151           || !def2 || gimple_nop_p (def2)
3152           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3153           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3154               && (is_gimple_assign (def2)
3155                   || is_gimple_call (def2)
3156                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3157                        == vect_induction_def
3158                   || (gimple_code (def2) == GIMPLE_PHI
3159                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3160                            == vect_internal_def
3161                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3162     {
3163       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3164         {
3165           /* Check if we can swap operands (just for simplicity - so that
3166              the rest of the code can assume that the reduction variable
3167              is always the last (second) argument).  */
3168           if (code == COND_EXPR)
3169             {
3170               /* Swap cond_expr by inverting the condition.  */
3171               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3172               enum tree_code invert_code = ERROR_MARK;
3173               enum tree_code cond_code = TREE_CODE (cond_expr);
3174
3175               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3176                 {
3177                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3178                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3179                 }
3180               if (invert_code != ERROR_MARK)
3181                 {
3182                   TREE_SET_CODE (cond_expr, invert_code);
3183                   swap_ssa_operands (def_stmt,
3184                                      gimple_assign_rhs2_ptr (def_stmt),
3185                                      gimple_assign_rhs3_ptr (def_stmt));
3186                 }
3187               else
3188                 {
3189                   if (dump_enabled_p ())
3190                     report_vect_op (MSG_NOTE, def_stmt,
3191                                     "detected reduction: cannot swap operands "
3192                                     "for cond_expr");
3193                   return NULL;
3194                 }
3195             }
3196           else
3197             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3198                                gimple_assign_rhs2_ptr (def_stmt));
3199
3200           if (dump_enabled_p ())
3201             report_vect_op (MSG_NOTE, def_stmt,
3202                             "detected reduction: need to swap operands: ");
3203
3204           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3205             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3206         }
3207       else
3208         {
3209           if (dump_enabled_p ())
3210             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3211         }
3212
3213       return def_stmt;
3214     }
3215
3216   /* Try to find SLP reduction chain.  */
3217   if (! nested_in_vect_loop
3218       && code != COND_EXPR
3219       && orig_code != MINUS_EXPR
3220       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3221     {
3222       if (dump_enabled_p ())
3223         report_vect_op (MSG_NOTE, def_stmt,
3224                         "reduction: detected reduction chain: ");
3225
3226       return def_stmt;
3227     }
3228
3229   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3230   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3231   while (first)
3232     {
3233       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3234       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3235       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3236       first = next;
3237     }
3238
3239   /* Look for the expression computing loop_arg from loop PHI result.  */
3240   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3241                             code))
3242     return def_stmt;
3243
3244   if (dump_enabled_p ())
3245     {
3246       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3247                       "reduction: unknown pattern: ");
3248     }
3249
3250   return NULL;
3251 }
3252
3253 /* Wrapper around vect_is_simple_reduction, which will modify code
3254    in-place if it enables detection of more reductions.  Arguments
3255    as there.  */
3256
3257 gimple *
3258 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3259                              bool *double_reduc,
3260                              bool need_wrapping_integral_overflow)
3261 {
3262   enum vect_reduction_type v_reduc_type;
3263   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3264                                           need_wrapping_integral_overflow,
3265                                           &v_reduc_type);
3266   if (def)
3267     {
3268       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3269       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3270       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3271       reduc_def_info = vinfo_for_stmt (def);
3272       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3273     }
3274   return def;
3275 }
3276
3277 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3278 int
3279 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3280                              int *peel_iters_epilogue,
3281                              stmt_vector_for_cost *scalar_cost_vec,
3282                              stmt_vector_for_cost *prologue_cost_vec,
3283                              stmt_vector_for_cost *epilogue_cost_vec)
3284 {
3285   int retval = 0;
3286   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3287
3288   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3289     {
3290       *peel_iters_epilogue = vf/2;
3291       if (dump_enabled_p ())
3292         dump_printf_loc (MSG_NOTE, vect_location,
3293                          "cost model: epilogue peel iters set to vf/2 "
3294                          "because loop iterations are unknown .\n");
3295
3296       /* If peeled iterations are known but number of scalar loop
3297          iterations are unknown, count a taken branch per peeled loop.  */
3298       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3299                                  NULL, 0, vect_prologue);
3300       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3301                                  NULL, 0, vect_epilogue);
3302     }
3303   else
3304     {
3305       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3306       peel_iters_prologue = niters < peel_iters_prologue ?
3307                             niters : peel_iters_prologue;
3308       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3309       /* If we need to peel for gaps, but no peeling is required, we have to
3310          peel VF iterations.  */
3311       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3312         *peel_iters_epilogue = vf;
3313     }
3314
3315   stmt_info_for_cost *si;
3316   int j;
3317   if (peel_iters_prologue)
3318     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3319         {
3320           stmt_vec_info stmt_info
3321             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3322           retval += record_stmt_cost (prologue_cost_vec,
3323                                       si->count * peel_iters_prologue,
3324                                       si->kind, stmt_info, si->misalign,
3325                                       vect_prologue);
3326         }
3327   if (*peel_iters_epilogue)
3328     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3329         {
3330           stmt_vec_info stmt_info
3331             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3332           retval += record_stmt_cost (epilogue_cost_vec,
3333                                       si->count * *peel_iters_epilogue,
3334                                       si->kind, stmt_info, si->misalign,
3335                                       vect_epilogue);
3336         }
3337
3338   return retval;
3339 }
3340
3341 /* Function vect_estimate_min_profitable_iters
3342
3343    Return the number of iterations required for the vector version of the
3344    loop to be profitable relative to the cost of the scalar version of the
3345    loop.
3346
3347    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3348    of iterations for vectorization.  -1 value means loop vectorization
3349    is not profitable.  This returned value may be used for dynamic
3350    profitability check.
3351
3352    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3353    for static check against estimated number of iterations.  */
3354
3355 static void
3356 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3357                                     int *ret_min_profitable_niters,
3358                                     int *ret_min_profitable_estimate)
3359 {
3360   int min_profitable_iters;
3361   int min_profitable_estimate;
3362   int peel_iters_prologue;
3363   int peel_iters_epilogue;
3364   unsigned vec_inside_cost = 0;
3365   int vec_outside_cost = 0;
3366   unsigned vec_prologue_cost = 0;
3367   unsigned vec_epilogue_cost = 0;
3368   int scalar_single_iter_cost = 0;
3369   int scalar_outside_cost = 0;
3370   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3371   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3372   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3373
3374   /* Cost model disabled.  */
3375   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3376     {
3377       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3378       *ret_min_profitable_niters = 0;
3379       *ret_min_profitable_estimate = 0;
3380       return;
3381     }
3382
3383   /* Requires loop versioning tests to handle misalignment.  */
3384   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3385     {
3386       /*  FIXME: Make cost depend on complexity of individual check.  */
3387       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3388       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3389                             vect_prologue);
3390       dump_printf (MSG_NOTE,
3391                    "cost model: Adding cost of checks for loop "
3392                    "versioning to treat misalignment.\n");
3393     }
3394
3395   /* Requires loop versioning with alias checks.  */
3396   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3397     {
3398       /*  FIXME: Make cost depend on complexity of individual check.  */
3399       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3400       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3401                             vect_prologue);
3402       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3403       if (len)
3404         /* Count LEN - 1 ANDs and LEN comparisons.  */
3405         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3406                               NULL, 0, vect_prologue);
3407       dump_printf (MSG_NOTE,
3408                    "cost model: Adding cost of checks for loop "
3409                    "versioning aliasing.\n");
3410     }
3411
3412   /* Requires loop versioning with niter checks.  */
3413   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3414     {
3415       /*  FIXME: Make cost depend on complexity of individual check.  */
3416       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3417                             vect_prologue);
3418       dump_printf (MSG_NOTE,
3419                    "cost model: Adding cost of checks for loop "
3420                    "versioning niters.\n");
3421     }
3422
3423   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3424     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3425                           vect_prologue);
3426
3427   /* Count statements in scalar loop.  Using this as scalar cost for a single
3428      iteration for now.
3429
3430      TODO: Add outer loop support.
3431
3432      TODO: Consider assigning different costs to different scalar
3433      statements.  */
3434
3435   scalar_single_iter_cost
3436     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3437
3438   /* Add additional cost for the peeled instructions in prologue and epilogue
3439      loop.
3440
3441      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3442      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3443
3444      TODO: Build an expression that represents peel_iters for prologue and
3445      epilogue to be used in a run-time test.  */
3446
3447   if (npeel  < 0)
3448     {
3449       peel_iters_prologue = vf/2;
3450       dump_printf (MSG_NOTE, "cost model: "
3451                    "prologue peel iters set to vf/2.\n");
3452
3453       /* If peeling for alignment is unknown, loop bound of main loop becomes
3454          unknown.  */
3455       peel_iters_epilogue = vf/2;
3456       dump_printf (MSG_NOTE, "cost model: "
3457                    "epilogue peel iters set to vf/2 because "
3458                    "peeling for alignment is unknown.\n");
3459
3460       /* If peeled iterations are unknown, count a taken branch and a not taken
3461          branch per peeled loop. Even if scalar loop iterations are known,
3462          vector iterations are not known since peeled prologue iterations are
3463          not known. Hence guards remain the same.  */
3464       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3465                             NULL, 0, vect_prologue);
3466       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3467                             NULL, 0, vect_prologue);
3468       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3469                             NULL, 0, vect_epilogue);
3470       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3471                             NULL, 0, vect_epilogue);
3472       stmt_info_for_cost *si;
3473       int j;
3474       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3475         {
3476           struct _stmt_vec_info *stmt_info
3477             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3478           (void) add_stmt_cost (target_cost_data,
3479                                 si->count * peel_iters_prologue,
3480                                 si->kind, stmt_info, si->misalign,
3481                                 vect_prologue);
3482           (void) add_stmt_cost (target_cost_data,
3483                                 si->count * peel_iters_epilogue,
3484                                 si->kind, stmt_info, si->misalign,
3485                                 vect_epilogue);
3486         }
3487     }
3488   else
3489     {
3490       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3491       stmt_info_for_cost *si;
3492       int j;
3493       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3494
3495       prologue_cost_vec.create (2);
3496       epilogue_cost_vec.create (2);
3497       peel_iters_prologue = npeel;
3498
3499       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3500                                           &peel_iters_epilogue,
3501                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3502                                             (loop_vinfo),
3503                                           &prologue_cost_vec,
3504                                           &epilogue_cost_vec);
3505
3506       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3507         {
3508           struct _stmt_vec_info *stmt_info
3509             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3510           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3511                                 si->misalign, vect_prologue);
3512         }
3513
3514       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3515         {
3516           struct _stmt_vec_info *stmt_info
3517             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3518           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3519                                 si->misalign, vect_epilogue);
3520         }
3521
3522       prologue_cost_vec.release ();
3523       epilogue_cost_vec.release ();
3524     }
3525
3526   /* FORNOW: The scalar outside cost is incremented in one of the
3527      following ways:
3528
3529      1. The vectorizer checks for alignment and aliasing and generates
3530      a condition that allows dynamic vectorization.  A cost model
3531      check is ANDED with the versioning condition.  Hence scalar code
3532      path now has the added cost of the versioning check.
3533
3534        if (cost > th & versioning_check)
3535          jmp to vector code
3536
3537      Hence run-time scalar is incremented by not-taken branch cost.
3538
3539      2. The vectorizer then checks if a prologue is required.  If the
3540      cost model check was not done before during versioning, it has to
3541      be done before the prologue check.
3542
3543        if (cost <= th)
3544          prologue = scalar_iters
3545        if (prologue == 0)
3546          jmp to vector code
3547        else
3548          execute prologue
3549        if (prologue == num_iters)
3550          go to exit
3551
3552      Hence the run-time scalar cost is incremented by a taken branch,
3553      plus a not-taken branch, plus a taken branch cost.
3554
3555      3. The vectorizer then checks if an epilogue is required.  If the
3556      cost model check was not done before during prologue check, it
3557      has to be done with the epilogue check.
3558
3559        if (prologue == 0)
3560          jmp to vector code
3561        else
3562          execute prologue
3563        if (prologue == num_iters)
3564          go to exit
3565        vector code:
3566          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3567            jmp to epilogue
3568
3569      Hence the run-time scalar cost should be incremented by 2 taken
3570      branches.
3571
3572      TODO: The back end may reorder the BBS's differently and reverse
3573      conditions/branch directions.  Change the estimates below to
3574      something more reasonable.  */
3575
3576   /* If the number of iterations is known and we do not do versioning, we can
3577      decide whether to vectorize at compile time.  Hence the scalar version
3578      do not carry cost model guard costs.  */
3579   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3580       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3581     {
3582       /* Cost model check occurs at versioning.  */
3583       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3584         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3585       else
3586         {
3587           /* Cost model check occurs at prologue generation.  */
3588           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3589             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3590               + vect_get_stmt_cost (cond_branch_not_taken);
3591           /* Cost model check occurs at epilogue generation.  */
3592           else
3593             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3594         }
3595     }
3596
3597   /* Complete the target-specific cost calculations.  */
3598   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3599                &vec_inside_cost, &vec_epilogue_cost);
3600
3601   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3602
3603   if (dump_enabled_p ())
3604     {
3605       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3606       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3607                    vec_inside_cost);
3608       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3609                    vec_prologue_cost);
3610       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3611                    vec_epilogue_cost);
3612       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3613                    scalar_single_iter_cost);
3614       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3615                    scalar_outside_cost);
3616       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3617                    vec_outside_cost);
3618       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3619                    peel_iters_prologue);
3620       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3621                    peel_iters_epilogue);
3622     }
3623
3624   /* Calculate number of iterations required to make the vector version
3625      profitable, relative to the loop bodies only.  The following condition
3626      must hold true:
3627      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3628      where
3629      SIC = scalar iteration cost, VIC = vector iteration cost,
3630      VOC = vector outside cost, VF = vectorization factor,
3631      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3632      SOC = scalar outside cost for run time cost model check.  */
3633
3634   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3635     {
3636       if (vec_outside_cost <= 0)
3637         min_profitable_iters = 0;
3638       else
3639         {
3640           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3641                                   - vec_inside_cost * peel_iters_prologue
3642                                   - vec_inside_cost * peel_iters_epilogue)
3643                                  / ((scalar_single_iter_cost * vf)
3644                                     - vec_inside_cost);
3645
3646           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3647               <= (((int) vec_inside_cost * min_profitable_iters)
3648                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3649             min_profitable_iters++;
3650         }
3651     }
3652   /* vector version will never be profitable.  */
3653   else
3654     {
3655       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3656         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3657                     "did not happen for a simd loop");
3658
3659       if (dump_enabled_p ())
3660         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3661                          "cost model: the vector iteration cost = %d "
3662                          "divided by the scalar iteration cost = %d "
3663                          "is greater or equal to the vectorization factor = %d"
3664                          ".\n",
3665                          vec_inside_cost, scalar_single_iter_cost, vf);
3666       *ret_min_profitable_niters = -1;
3667       *ret_min_profitable_estimate = -1;
3668       return;
3669     }
3670
3671   dump_printf (MSG_NOTE,
3672                "  Calculated minimum iters for profitability: %d\n",
3673                min_profitable_iters);
3674
3675   /* We want the vectorized loop to execute at least once.  */
3676   if (min_profitable_iters < (vf + peel_iters_prologue))
3677     min_profitable_iters = vf + peel_iters_prologue;
3678
3679   if (dump_enabled_p ())
3680     dump_printf_loc (MSG_NOTE, vect_location,
3681                      "  Runtime profitability threshold = %d\n",
3682                      min_profitable_iters);
3683
3684   *ret_min_profitable_niters = min_profitable_iters;
3685
3686   /* Calculate number of iterations required to make the vector version
3687      profitable, relative to the loop bodies only.
3688
3689      Non-vectorized variant is SIC * niters and it must win over vector
3690      variant on the expected loop trip count.  The following condition must hold true:
3691      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3692
3693   if (vec_outside_cost <= 0)
3694     min_profitable_estimate = 0;
3695   else
3696     {
3697       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3698                                  - vec_inside_cost * peel_iters_prologue
3699                                  - vec_inside_cost * peel_iters_epilogue)
3700                                  / ((scalar_single_iter_cost * vf)
3701                                    - vec_inside_cost);
3702     }
3703   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3704   if (dump_enabled_p ())
3705     dump_printf_loc (MSG_NOTE, vect_location,
3706                      "  Static estimate profitability threshold = %d\n",
3707                      min_profitable_estimate);
3708
3709   *ret_min_profitable_estimate = min_profitable_estimate;
3710 }
3711
3712 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3713    vector elements (not bits) for a vector with NELT elements.  */
3714 static void
3715 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3716                               vec_perm_indices *sel)
3717 {
3718   unsigned int i;
3719
3720   for (i = 0; i < nelt; i++)
3721     sel->quick_push ((i + offset) & (2 * nelt - 1));
3722 }
3723
3724 /* Checks whether the target supports whole-vector shifts for vectors of mode
3725    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3726    it supports vec_perm_const with masks for all necessary shift amounts.  */
3727 static bool
3728 have_whole_vector_shift (machine_mode mode)
3729 {
3730   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3731     return true;
3732
3733   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3734     return false;
3735
3736   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3737   auto_vec_perm_indices sel (nelt);
3738
3739   for (i = nelt/2; i >= 1; i/=2)
3740     {
3741       sel.truncate (0);
3742       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3743       if (!can_vec_perm_p (mode, false, &sel))
3744         return false;
3745     }
3746   return true;
3747 }
3748
3749 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3750    functions. Design better to avoid maintenance issues.  */
3751
3752 /* Function vect_model_reduction_cost.
3753
3754    Models cost for a reduction operation, including the vector ops
3755    generated within the strip-mine loop, the initial definition before
3756    the loop, and the epilogue code that must be generated.  */
3757
3758 static void
3759 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3760                            int ncopies)
3761 {
3762   int prologue_cost = 0, epilogue_cost = 0;
3763   enum tree_code code;
3764   optab optab;
3765   tree vectype;
3766   gimple *orig_stmt;
3767   machine_mode mode;
3768   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3769   struct loop *loop = NULL;
3770   void *target_cost_data;
3771
3772   if (loop_vinfo)
3773     {
3774       loop = LOOP_VINFO_LOOP (loop_vinfo);
3775       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3776     }
3777   else
3778     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3779
3780   /* Condition reductions generate two reductions in the loop.  */
3781   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3782     ncopies *= 2;
3783
3784   /* Cost of reduction op inside loop.  */
3785   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3786                                         stmt_info, 0, vect_body);
3787
3788   vectype = STMT_VINFO_VECTYPE (stmt_info);
3789   mode = TYPE_MODE (vectype);
3790   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3791
3792   if (!orig_stmt)
3793     orig_stmt = STMT_VINFO_STMT (stmt_info);
3794
3795   code = gimple_assign_rhs_code (orig_stmt);
3796
3797   /* Add in cost for initial definition.
3798      For cond reduction we have four vectors: initial index, step, initial
3799      result of the data reduction, initial value of the index reduction.  */
3800   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3801                        == COND_REDUCTION ? 4 : 1;
3802   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3803                                   scalar_to_vec, stmt_info, 0,
3804                                   vect_prologue);
3805
3806   /* Determine cost of epilogue code.
3807
3808      We have a reduction operator that will reduce the vector in one statement.
3809      Also requires scalar extract.  */
3810
3811   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3812     {
3813       if (reduc_fn != IFN_LAST)
3814         {
3815           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3816             {
3817               /* An EQ stmt and an COND_EXPR stmt.  */
3818               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3819                                               vector_stmt, stmt_info, 0,
3820                                               vect_epilogue);
3821               /* Reduction of the max index and a reduction of the found
3822                  values.  */
3823               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3824                                               vec_to_scalar, stmt_info, 0,
3825                                               vect_epilogue);
3826               /* A broadcast of the max value.  */
3827               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3828                                               scalar_to_vec, stmt_info, 0,
3829                                               vect_epilogue);
3830             }
3831           else
3832             {
3833               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3834                                               stmt_info, 0, vect_epilogue);
3835               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3836                                               vec_to_scalar, stmt_info, 0,
3837                                               vect_epilogue);
3838             }
3839         }
3840       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3841         {
3842           unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3843           /* Extraction of scalar elements.  */
3844           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3845                                           vec_to_scalar, stmt_info, 0,
3846                                           vect_epilogue);
3847           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3848           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3849                                           scalar_stmt, stmt_info, 0,
3850                                           vect_epilogue);
3851         }
3852       else
3853         {
3854           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3855           tree bitsize =
3856             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3857           int element_bitsize = tree_to_uhwi (bitsize);
3858           int nelements = vec_size_in_bits / element_bitsize;
3859
3860           if (code == COND_EXPR)
3861             code = MAX_EXPR;
3862
3863           optab = optab_for_tree_code (code, vectype, optab_default);
3864
3865           /* We have a whole vector shift available.  */
3866           if (optab != unknown_optab
3867               && VECTOR_MODE_P (mode)
3868               && optab_handler (optab, mode) != CODE_FOR_nothing
3869               && have_whole_vector_shift (mode))
3870             {
3871               /* Final reduction via vector shifts and the reduction operator.
3872                  Also requires scalar extract.  */
3873               epilogue_cost += add_stmt_cost (target_cost_data,
3874                                               exact_log2 (nelements) * 2,
3875                                               vector_stmt, stmt_info, 0,
3876                                               vect_epilogue);
3877               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3878                                               vec_to_scalar, stmt_info, 0,
3879                                               vect_epilogue);
3880             }
3881           else
3882             /* Use extracts and reduction op for final reduction.  For N
3883                elements, we have N extracts and N-1 reduction ops.  */
3884             epilogue_cost += add_stmt_cost (target_cost_data,
3885                                             nelements + nelements - 1,
3886                                             vector_stmt, stmt_info, 0,
3887                                             vect_epilogue);
3888         }
3889     }
3890
3891   if (dump_enabled_p ())
3892     dump_printf (MSG_NOTE,
3893                  "vect_model_reduction_cost: inside_cost = %d, "
3894                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3895                  prologue_cost, epilogue_cost);
3896 }
3897
3898
3899 /* Function vect_model_induction_cost.
3900
3901    Models cost for induction operations.  */
3902
3903 static void
3904 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3905 {
3906   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3907   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3908   unsigned inside_cost, prologue_cost;
3909
3910   if (PURE_SLP_STMT (stmt_info))
3911     return;
3912
3913   /* loop cost for vec_loop.  */
3914   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3915                                stmt_info, 0, vect_body);
3916
3917   /* prologue cost for vec_init and vec_step.  */
3918   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3919                                  stmt_info, 0, vect_prologue);
3920
3921   if (dump_enabled_p ())
3922     dump_printf_loc (MSG_NOTE, vect_location,
3923                      "vect_model_induction_cost: inside_cost = %d, "
3924                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3925 }
3926
3927
3928
3929 /* Function get_initial_def_for_reduction
3930
3931    Input:
3932    STMT - a stmt that performs a reduction operation in the loop.
3933    INIT_VAL - the initial value of the reduction variable
3934
3935    Output:
3936    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3937         of the reduction (used for adjusting the epilog - see below).
3938    Return a vector variable, initialized according to the operation that STMT
3939         performs. This vector will be used as the initial value of the
3940         vector of partial results.
3941
3942    Option1 (adjust in epilog): Initialize the vector as follows:
3943      add/bit or/xor:    [0,0,...,0,0]
3944      mult/bit and:      [1,1,...,1,1]
3945      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3946    and when necessary (e.g. add/mult case) let the caller know
3947    that it needs to adjust the result by init_val.
3948
3949    Option2: Initialize the vector as follows:
3950      add/bit or/xor:    [init_val,0,0,...,0]
3951      mult/bit and:      [init_val,1,1,...,1]
3952      min/max/cond_expr: [init_val,init_val,...,init_val]
3953    and no adjustments are needed.
3954
3955    For example, for the following code:
3956
3957    s = init_val;
3958    for (i=0;i<n;i++)
3959      s = s + a[i];
3960
3961    STMT is 's = s + a[i]', and the reduction variable is 's'.
3962    For a vector of 4 units, we want to return either [0,0,0,init_val],
3963    or [0,0,0,0] and let the caller know that it needs to adjust
3964    the result at the end by 'init_val'.
3965
3966    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3967    initialization vector is simpler (same element in all entries), if
3968    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3969
3970    A cost model should help decide between these two schemes.  */
3971
3972 tree
3973 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3974                                tree *adjustment_def)
3975 {
3976   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3977   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3978   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3979   tree scalar_type = TREE_TYPE (init_val);
3980   tree vectype = get_vectype_for_scalar_type (scalar_type);
3981   enum tree_code code = gimple_assign_rhs_code (stmt);
3982   tree def_for_init;
3983   tree init_def;
3984   bool nested_in_vect_loop = false;
3985   REAL_VALUE_TYPE real_init_val = dconst0;
3986   int int_init_val = 0;
3987   gimple *def_stmt = NULL;
3988   gimple_seq stmts = NULL;
3989
3990   gcc_assert (vectype);
3991
3992   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3993               || SCALAR_FLOAT_TYPE_P (scalar_type));
3994
3995   if (nested_in_vect_loop_p (loop, stmt))
3996     nested_in_vect_loop = true;
3997   else
3998     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3999
4000   /* In case of double reduction we only create a vector variable to be put
4001      in the reduction phi node.  The actual statement creation is done in
4002      vect_create_epilog_for_reduction.  */
4003   if (adjustment_def && nested_in_vect_loop
4004       && TREE_CODE (init_val) == SSA_NAME
4005       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4006       && gimple_code (def_stmt) == GIMPLE_PHI
4007       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4008       && vinfo_for_stmt (def_stmt)
4009       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4010           == vect_double_reduction_def)
4011     {
4012       *adjustment_def = NULL;
4013       return vect_create_destination_var (init_val, vectype);
4014     }
4015
4016   /* In case of a nested reduction do not use an adjustment def as
4017      that case is not supported by the epilogue generation correctly
4018      if ncopies is not one.  */
4019   if (adjustment_def && nested_in_vect_loop)
4020     {
4021       *adjustment_def = NULL;
4022       return vect_get_vec_def_for_operand (init_val, stmt);
4023     }
4024
4025   switch (code)
4026     {
4027     case WIDEN_SUM_EXPR:
4028     case DOT_PROD_EXPR:
4029     case SAD_EXPR:
4030     case PLUS_EXPR:
4031     case MINUS_EXPR:
4032     case BIT_IOR_EXPR:
4033     case BIT_XOR_EXPR:
4034     case MULT_EXPR:
4035     case BIT_AND_EXPR:
4036       {
4037         /* ADJUSTMENT_DEF is NULL when called from
4038            vect_create_epilog_for_reduction to vectorize double reduction.  */
4039         if (adjustment_def)
4040           *adjustment_def = init_val;
4041
4042         if (code == MULT_EXPR)
4043           {
4044             real_init_val = dconst1;
4045             int_init_val = 1;
4046           }
4047
4048         if (code == BIT_AND_EXPR)
4049           int_init_val = -1;
4050
4051         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4052           def_for_init = build_real (scalar_type, real_init_val);
4053         else
4054           def_for_init = build_int_cst (scalar_type, int_init_val);
4055
4056         if (adjustment_def)
4057           /* Option1: the first element is '0' or '1' as well.  */
4058           init_def = gimple_build_vector_from_val (&stmts, vectype,
4059                                                    def_for_init);
4060         else
4061           {
4062             /* Option2: the first element is INIT_VAL.  */
4063             tree_vector_builder elts (vectype, 1, 2);
4064             elts.quick_push (init_val);
4065             elts.quick_push (def_for_init);
4066             init_def = gimple_build_vector (&stmts, &elts);
4067           }
4068       }
4069       break;
4070
4071     case MIN_EXPR:
4072     case MAX_EXPR:
4073     case COND_EXPR:
4074       {
4075         if (adjustment_def)
4076           {
4077             *adjustment_def = NULL_TREE;
4078             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4079               {
4080                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4081                 break;
4082               }
4083           }
4084         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4085         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4086       }
4087       break;
4088
4089     default:
4090       gcc_unreachable ();
4091     }
4092
4093   if (stmts)
4094     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4095   return init_def;
4096 }
4097
4098 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4099    NUMBER_OF_VECTORS is the number of vector defs to create.

        The vector defs are appended to VEC_OPRNDS with quick_push (the
        caller visible in this file reserves space for them beforehand).
        CODE is the reduction operation; it selects the neutral element
        (zero, one, all-ones, or none) placed in lanes that must not carry
        an initial value.  REDUC_CHAIN is true for a reduction chain, in
        which case only the first scalar stmt contributes its initial
        value and the other lanes get the neutral element.

        The scalar stmts of SLP_NODE are treated as PHIs: the initial
        values are their arguments on the preheader edge of the loop that
        contains the first of them.  Any statements needed to build the
        vectors are inserted on that same edge.  */
4100
4101 static void
4102 get_initial_defs_for_reduction (slp_tree slp_node,
4103                                 vec<tree> *vec_oprnds,
4104                                 unsigned int number_of_vectors,
4105                                 enum tree_code code, bool reduc_chain)
4106 {
4107   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
       /* STMT starts out as the first scalar stmt; it is later reused as the
          iteration variable of the fill loop below.  */
4108   gimple *stmt = stmts[0];
4109   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4110   unsigned nunits;
4111   unsigned j, number_of_places_left_in_vector;
4112   tree vector_type, scalar_type;
4113   tree vop;
4114   int group_size = stmts.length ();
4115   unsigned int vec_num, i;
4116   unsigned number_of_copies = 1;
       /* Temporary list of the vectors built below, in creation order.  */
4117   vec<tree> voprnds;
4118   voprnds.create (number_of_vectors);
4119   tree neutral_op = NULL;
4120   struct loop *loop;
4121
4122   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4123   scalar_type = TREE_TYPE (vector_type);
4124   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4125
4126   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4127
4128   loop = (gimple_bb (stmt))->loop_father;
4129   gcc_assert (loop);
4130   edge pe = loop_preheader_edge (loop);
4131
4132   /* op is the reduction operand of the first stmt already.  */
4133   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4134      we need either neutral operands or the original operands.  See
4135      get_initial_def_for_reduction() for details.  */
4136   switch (code)
4137     {
4138     case WIDEN_SUM_EXPR:
4139     case DOT_PROD_EXPR:
4140     case SAD_EXPR:
4141     case PLUS_EXPR:
4142     case MINUS_EXPR:
4143     case BIT_IOR_EXPR:
4144     case BIT_XOR_EXPR:
4145       neutral_op = build_zero_cst (scalar_type);
4146       break;
4147
4148     case MULT_EXPR:
4149       neutral_op = build_one_cst (scalar_type);
4150       break;
4151
4152     case BIT_AND_EXPR:
4153       neutral_op = build_all_ones_cst (scalar_type);
4154       break;
4155
4156     /* For MIN/MAX we don't have an easy neutral operand but
4157        the initial values can be used fine here.  Only for
4158        a reduction chain we have to force a neutral element.
             STMT is still stmts[0] at this point, so for a chain the
             "neutral" element is the preheader value of the first PHI.  */
4159     case MAX_EXPR:
4160     case MIN_EXPR:
4161       if (! reduc_chain)
4162         neutral_op = NULL;
4163       else
4164         neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4165       break;
4166
         /* Any other code has no neutral element and is not expected in a
            reduction chain.  */
4167     default:
4168       gcc_assert (! reduc_chain);
4169       neutral_op = NULL;
4170     }
4171
4172   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4173      created vectors. It is greater than 1 if unrolling is performed.
4174
4175      For example, we have two scalar operands, s1 and s2 (e.g., group of
4176      strided accesses of size two), while NUNITS is four (i.e., four scalars
4177      of this type can be packed in a vector).  The output vector will contain
4178      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4179      will be 2).
4180
4181      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4182      containing the operands.
4183
4184      For example, NUNITS is four as before, and the group size is 8
4185      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4186      {s5, s6, s7, s8}.  */
4187
4188   number_of_copies = nunits * number_of_vectors / group_size;
4189
       /* The builder is pre-grown to NUNITS elements so that lanes can be
          assigned by index, from the last lane down to lane 0.  */
4190   number_of_places_left_in_vector = nunits;
4191   tree_vector_builder elts (vector_type, nunits, 1);
4192   elts.quick_grow (nunits);
4193   for (j = 0; j < number_of_copies; j++)
4194     {
           /* Walk the scalar stmts from last to first.  I is unsigned, so
              decrementing it past 0 wraps around; the loop ends when
              stmts.iterate rejects that index (presumably it returns false
              for an out-of-range index - confirm against vec::iterate).  */
4195       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4196         {
4197           tree op;
4198           /* Get the def before the loop.  In reduction chain we have only
4199              one initial value.
                   When a neutral element exists, the real initial value is used
                   only in the last copy (and, for a chain, only for stmt 0);
                   every other lane gets the neutral element.  Without a neutral
                   element every lane gets its PHI's preheader value.  */
4200           if ((j != (number_of_copies - 1)
4201                || (reduc_chain && i != 0))
4202               && neutral_op)
4203             op = neutral_op;
4204           else
4205             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4206
4207           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4208           number_of_places_left_in_vector--;
4209           elts[number_of_places_left_in_vector] = op;
4210
               /* The vector is complete: materialize it (inserting any
                  statements this needs on the preheader edge), remember it
                  and start filling a fresh one.  */
4211           if (number_of_places_left_in_vector == 0)
4212             {
4213               gimple_seq ctor_seq = NULL;
4214               tree init = gimple_build_vector (&ctor_seq, &elts);
4215               if (ctor_seq != NULL)
4216                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4217               voprnds.quick_push (init);
4218
4219               number_of_places_left_in_vector = nunits;
4220               elts.new_vector (vector_type, nunits, 1);
4221               elts.quick_grow (nunits);
4222             }
4223         }
4224     }
4225
4226   /* Since the vectors are created in the reverse order, we should invert
4227      them.  */
4228   vec_num = voprnds.length ();
4229   for (j = vec_num; j != 0; j--)
4230     {
4231       vop = voprnds[j - 1];
4232       vec_oprnds->quick_push (vop);
4233     }
4234
4235   voprnds.release ();
4236
4237   /* In case that VF is greater than the unrolling factor needed for the SLP
4238      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4239      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4240      to replicate the vectors.  */
4241   tree neutral_vec = NULL;
4242   while (number_of_vectors > vec_oprnds->length ())
4243     {
           /* With a neutral element, pad with a vector made entirely of it;
              that vector is built once, on first use.  */
4244       if (neutral_op)
4245         {
4246           if (!neutral_vec)
4247             {
4248               gimple_seq ctor_seq = NULL;
4249               neutral_vec = gimple_build_vector_from_val
4250                 (&ctor_seq, vector_type, neutral_op);
4251               if (ctor_seq != NULL)
4252                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4253             }
4254           vec_oprnds->quick_push (neutral_vec);
4255         }
           /* Without one, append another copy of the first VEC_NUM vectors.
              NOTE(review): if VEC_NUM is 0 nothing is pushed here and the
              enclosing while loop would not terminate - presumably callers
              never reach this state; confirm.  */
4256       else
4257         {
4258           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4259             vec_oprnds->quick_push (vop);
4260         }
4261     }
4262 }
4263
4264
4265 /* Function vect_create_epilog_for_reduction
4266
4267    Create code at the loop-epilog to finalize the result of a reduction
4268    computation.
4269
4270    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
4271      reduction statements.
4272    STMT is the scalar reduction stmt that is being vectorized.
4273    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4274      number of elements that we can fit in a vectype (nunits).  In this case
4275      we have to generate more than one vector stmt - i.e - we need to "unroll"
4276      the vector stmt by a factor VF/nunits.  For more details see documentation
4277      in vectorizable_operation.
4278    REDUC_FN is the internal function for the epilog reduction.
4279    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4280      computation.
4281    REDUC_INDEX is the index of the operand in the right hand side of the
4282      statement that is defined by REDUCTION_PHI.
4283    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4284    SLP_NODE is an SLP node containing a group of reduction statements. The
4285      first one in this group is STMT.
4286    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4287      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4288      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4289      any value of the IV in the loop.
4290    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4291
4292    This function:
4293    1. Creates the reduction def-use cycles: sets the arguments for
4294       REDUCTION_PHIS:
4295       The loop-entry argument is the vectorized initial-value of the reduction.
4296       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4297       sums.
4298    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4299       by calling the function specified by REDUC_FN if available, or by
4300       other means (whole-vector shifts or a scalar loop).
4301       The function also creates a new phi node at the loop exit to preserve
4302       loop-closed form, as illustrated below.
4303
4304      The flow at the entry to this function:
4305
4306         loop:
4307           vec_def = phi <null, null>            # REDUCTION_PHI
4308           VECT_DEF = vector_stmt                # vectorized form of STMT
4309           s_loop = scalar_stmt                  # (scalar) STMT
4310         loop_exit:
4311           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4312           use <s_out0>
4313           use <s_out0>
4314
4315      The above is transformed by this function into:
4316
4317         loop:
4318           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4319           VECT_DEF = vector_stmt                # vectorized form of STMT
4320           s_loop = scalar_stmt                  # (scalar) STMT
4321         loop_exit:
4322           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4323           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4324           v_out2 = reduce <v_out1>
4325           s_out3 = extract_field <v_out2, 0>
4326           s_out4 = adjust_result <s_out3>
4327           use <s_out4>
4328           use <s_out4>
4329 */
4330
4331 static void
4332 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4333                                   gimple *reduc_def_stmt,
4334                                   int ncopies, internal_fn reduc_fn,
4335                                   vec<gimple *> reduction_phis,
4336                                   bool double_reduc,
4337                                   slp_tree slp_node,
4338                                   slp_instance slp_node_instance,
4339                                   tree induc_val, enum tree_code induc_code)
4340 {
4341   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4342   stmt_vec_info prev_phi_info;
4343   tree vectype;
4344   machine_mode mode;
4345   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4346   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4347   basic_block exit_bb;
4348   tree scalar_dest;
4349   tree scalar_type;
4350   gimple *new_phi = NULL, *phi;
4351   gimple_stmt_iterator exit_gsi;
4352   tree vec_dest;
4353   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4354   gimple *epilog_stmt = NULL;
4355   enum tree_code code = gimple_assign_rhs_code (stmt);
4356   gimple *exit_phi;
4357   tree bitsize;
4358   tree adjustment_def = NULL;
4359   tree vec_initial_def = NULL;
4360   tree expr, def, initial_def = NULL;
4361   tree orig_name, scalar_result;
4362   imm_use_iterator imm_iter, phi_imm_iter;
4363   use_operand_p use_p, phi_use_p;
4364   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4365   bool nested_in_vect_loop = false;
4366   auto_vec<gimple *> new_phis;
4367   auto_vec<gimple *> inner_phis;
4368   enum vect_def_type dt = vect_unknown_def_type;
4369   int j, i;
4370   auto_vec<tree> scalar_results;
4371   unsigned int group_size = 1, k, ratio;
4372   auto_vec<tree> vec_initial_defs;
4373   auto_vec<gimple *> phis;
4374   bool slp_reduc = false;
4375   tree new_phi_result;
4376   gimple *inner_phi = NULL;
4377   tree induction_index = NULL_TREE;
4378
4379   if (slp_node)
4380     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4381
4382   if (nested_in_vect_loop_p (loop, stmt))
4383     {
4384       outer_loop = loop;
4385       loop = loop->inner;
4386       nested_in_vect_loop = true;
4387       gcc_assert (!slp_node);
4388     }
4389
4390   vectype = STMT_VINFO_VECTYPE (stmt_info);
4391   gcc_assert (vectype);
4392   mode = TYPE_MODE (vectype);
4393
4394   /* 1. Create the reduction def-use cycle:
4395      Set the arguments of REDUCTION_PHIS, i.e., transform
4396
4397         loop:
4398           vec_def = phi <null, null>            # REDUCTION_PHI
4399           VECT_DEF = vector_stmt                # vectorized form of STMT
4400           ...
4401
4402      into:
4403
4404         loop:
4405           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4406           VECT_DEF = vector_stmt                # vectorized form of STMT
4407           ...
4408
4409      (in case of SLP, do it for all the phis). */
4410
4411   /* Get the loop-entry arguments.  */
4412   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4413   if (slp_node)
4414     {
4415       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4416       vec_initial_defs.reserve (vec_num);
4417       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4418                                       &vec_initial_defs, vec_num, code,
4419                                       GROUP_FIRST_ELEMENT (stmt_info));
4420     }
4421   else
4422     {
4423       /* Get at the scalar def before the loop, that defines the initial value
4424          of the reduction variable.  */
4425       gimple *def_stmt;
4426       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4427                                            loop_preheader_edge (loop));
4428       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4429          and we can't use zero for induc_val, use initial_def.  Similarly
4430          for REDUC_MIN and initial_def larger than the base.  */
4431       if (TREE_CODE (initial_def) == INTEGER_CST
4432           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4433               == INTEGER_INDUC_COND_REDUCTION)
4434           && !integer_zerop (induc_val)
4435           && ((reduc_fn == IFN_REDUC_MAX
4436                && tree_int_cst_lt (initial_def, induc_val))
4437               || (reduc_fn == IFN_REDUC_MIN
4438                   && tree_int_cst_lt (induc_val, initial_def))))
4439         induc_val = initial_def;
4440       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4441       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4442                                                        &adjustment_def);
4443       vec_initial_defs.create (1);
4444       vec_initial_defs.quick_push (vec_initial_def);
4445     }
4446
4447   /* Set phi nodes arguments.  */
4448   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4449     {
4450       tree vec_init_def = vec_initial_defs[i];
4451       tree def = vect_defs[i];
4452       for (j = 0; j < ncopies; j++)
4453         {
4454           if (j != 0)
4455             {
4456               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4457               if (nested_in_vect_loop)
4458                 vec_init_def
4459                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4460                                                     vec_init_def);
4461             }
4462
4463           /* Set the loop-entry arg of the reduction-phi.  */
4464
4465           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4466               == INTEGER_INDUC_COND_REDUCTION)
4467             {
4468               /* Initialise the reduction phi to a vector of INDUC_VAL.  This
4469                  prevents the initial value from interfering with the reduction op.  */
4470               gcc_assert (ncopies == 1);
4471               gcc_assert (i == 0);
4472
4473               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4474               tree induc_val_vec
4475                 = build_vector_from_val (vec_init_def_type, induc_val);
4476
4477               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4478                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4479             }
4480           else
4481             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4482                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4483
4484           /* Set the loop-latch arg for the reduction-phi.  */
4485           if (j > 0)
4486             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4487
4488           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4489                        UNKNOWN_LOCATION);
4490
4491           if (dump_enabled_p ())
4492             {
4493               dump_printf_loc (MSG_NOTE, vect_location,
4494                                "transform reduction: created def-use cycle: ");
4495               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4496               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4497             }
4498         }
4499     }
4500
4501   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4502      which is updated with the current index of the loop for every match of
4503      the original loop's cond_expr (VEC_STMT).  This results in a vector
4504      containing the last time the condition passed for that vector lane.
4505      The first match will be a 1 to allow 0 to be used for non-matching
4506      indexes.  If there are no matches at all then the vector will be all
4507      zeroes.  */
4508   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4509     {
4510       tree indx_before_incr, indx_after_incr;
4511       int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4512       int k;
4513
4514       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4515       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4516
4517       int scalar_precision
4518         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4519       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4520       tree cr_index_vector_type = build_vector_type
4521         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4522
4523       /* First we create a simple vector induction variable which starts
4524          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4525          vector size (STEP).  */
4526
4527       /* Create a {1,2,3,...} vector.  */
4528       tree_vector_builder vtemp (cr_index_vector_type, 1, 3);
4529       for (k = 0; k < 3; ++k)
4530         vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
4531       tree series_vect = vtemp.build ();
4532
4533       /* Create a vector of the step value.  */
4534       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4535       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4536
4537       /* Create an induction variable.  */
4538       gimple_stmt_iterator incr_gsi;
4539       bool insert_after;
4540       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4541       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4542                  insert_after, &indx_before_incr, &indx_after_incr);
4543
4544       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4545          filled with zeros (VEC_ZERO).  */
4546
4547       /* Create a vector of 0s.  */
4548       tree zero = build_zero_cst (cr_index_scalar_type);
4549       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4550
4551       /* Create a vector phi node.  */
4552       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4553       new_phi = create_phi_node (new_phi_tree, loop->header);
4554       set_vinfo_for_stmt (new_phi,
4555                           new_stmt_vec_info (new_phi, loop_vinfo));
4556       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4557                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4558
4559       /* Now take the condition from the loops original cond_expr
4560          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4561          every match uses values from the induction variable
4562          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4563          (NEW_PHI_TREE).
4564          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4565          the new cond_expr (INDEX_COND_EXPR).  */
4566
4567       /* Duplicate the condition from vec_stmt.  */
4568       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4569
4570       /* Create a conditional, where the condition is taken from vec_stmt
4571          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4572          else is the phi (NEW_PHI_TREE).  */
4573       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4574                                      ccompare, indx_before_incr,
4575                                      new_phi_tree);
4576       induction_index = make_ssa_name (cr_index_vector_type);
4577       gimple *index_condition = gimple_build_assign (induction_index,
4578                                                      index_cond_expr);
4579       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4580       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4581                                                         loop_vinfo);
4582       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4583       set_vinfo_for_stmt (index_condition, index_vec_info);
4584
4585       /* Update the phi with the vec cond.  */
4586       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4587                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4588     }
4589
4590   /* 2. Create epilog code.
4591         The reduction epilog code operates across the elements of the vector
4592         of partial results computed by the vectorized loop.
4593         The reduction epilog code consists of:
4594
4595         step 1: compute the scalar result in a vector (v_out2)
4596         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4597         step 3: adjust the scalar result (s_out3) if needed.
4598
4599         Step 1 can be accomplished using one of the following three schemes:
4600           (scheme 1) using reduc_fn, if available.
4601           (scheme 2) using whole-vector shifts, if available.
4602           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4603                      combined.
4604
4605           The overall epilog code looks like this:
4606
4607           s_out0 = phi <s_loop>         # original EXIT_PHI
4608           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4609           v_out2 = reduce <v_out1>              # step 1
4610           s_out3 = extract_field <v_out2, 0>    # step 2
4611           s_out4 = adjust_result <s_out3>       # step 3
4612
4613           (step 3 is optional, and steps 1 and 2 may be combined).
4614           Lastly, the uses of s_out0 are replaced by s_out4.  */
4615
4616
4617   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4618          v_out1 = phi <VECT_DEF>
4619          Store them in NEW_PHIS.  */
4620
4621   exit_bb = single_exit (loop)->dest;
4622   prev_phi_info = NULL;
4623   new_phis.create (vect_defs.length ());
4624   FOR_EACH_VEC_ELT (vect_defs, i, def)
4625     {
4626       for (j = 0; j < ncopies; j++)
4627         {
4628           tree new_def = copy_ssa_name (def);
4629           phi = create_phi_node (new_def, exit_bb);
4630           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4631           if (j == 0)
4632             new_phis.quick_push (phi);
4633           else
4634             {
4635               def = vect_get_vec_def_for_stmt_copy (dt, def);
4636               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4637             }
4638
4639           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4640           prev_phi_info = vinfo_for_stmt (phi);
4641         }
4642     }
4643
4644   /* The epilogue is created for the outer-loop, i.e., for the loop being
4645      vectorized.  Create exit phis for the outer loop.  */
4646   if (double_reduc)
4647     {
4648       loop = outer_loop;
4649       exit_bb = single_exit (loop)->dest;
4650       inner_phis.create (vect_defs.length ());
4651       FOR_EACH_VEC_ELT (new_phis, i, phi)
4652         {
4653           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4654           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4655           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4656                            PHI_RESULT (phi));
4657           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4658                                                             loop_vinfo));
4659           inner_phis.quick_push (phi);
4660           new_phis[i] = outer_phi;
4661           prev_phi_info = vinfo_for_stmt (outer_phi);
4662           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4663             {
4664               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4665               new_result = copy_ssa_name (PHI_RESULT (phi));
4666               outer_phi = create_phi_node (new_result, exit_bb);
4667               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4668                                PHI_RESULT (phi));
4669               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4670                                                                 loop_vinfo));
4671               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4672               prev_phi_info = vinfo_for_stmt (outer_phi);
4673             }
4674         }
4675     }
4676
4677   exit_gsi = gsi_after_labels (exit_bb);
4678
4679   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4680          (i.e. when reduc_fn is not available) and in the final adjustment
4681          code (if needed).  Also get the original scalar reduction variable as
4682          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4683          represents a reduction pattern), the tree-code and scalar-def are
4684          taken from the original stmt that the pattern-stmt (STMT) replaces.
4685          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4686          are taken from STMT.  */
4687
4688   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4689   if (!orig_stmt)
4690     {
4691       /* Regular reduction  */
4692       orig_stmt = stmt;
4693     }
4694   else
4695     {
4696       /* Reduction pattern  */
4697       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4698       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4699       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4700     }
4701
4702   code = gimple_assign_rhs_code (orig_stmt);
4703   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4704      partial results are added and not subtracted.  */
4705   if (code == MINUS_EXPR)
4706     code = PLUS_EXPR;
4707
4708   scalar_dest = gimple_assign_lhs (orig_stmt);
4709   scalar_type = TREE_TYPE (scalar_dest);
4710   scalar_results.create (group_size);
4711   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4712   bitsize = TYPE_SIZE (scalar_type);
4713
4714   /* In case this is a reduction in an inner-loop while vectorizing an outer
4715      loop - we don't need to extract a single scalar result at the end of the
4716      inner-loop (unless it is double reduction, i.e., the use of reduction is
4717      outside the outer-loop).  The final vector of partial results will be used
4718      in the vectorized outer-loop, or reduced to a scalar result at the end of
4719      the outer-loop.  */
4720   if (nested_in_vect_loop && !double_reduc)
4721     goto vect_finalize_reduction;
4722
4723   /* SLP reduction without reduction chain, e.g.,
4724      # a1 = phi <a2, a0>
4725      # b1 = phi <b2, b0>
4726      a2 = operation (a1)
4727      b2 = operation (b1)  */
4728   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4729
4730   /* In case of reduction chain, e.g.,
4731      # a1 = phi <a3, a0>
4732      a2 = operation (a1)
4733      a3 = operation (a2),
4734
4735      we may end up with more than one vector result.  Here we reduce them to
4736      one vector.  */
4737   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4738     {
4739       tree first_vect = PHI_RESULT (new_phis[0]);
4740       gassign *new_vec_stmt = NULL;
4741       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4742       for (k = 1; k < new_phis.length (); k++)
4743         {
4744           gimple *next_phi = new_phis[k];
4745           tree second_vect = PHI_RESULT (next_phi);
4746           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4747           new_vec_stmt = gimple_build_assign (tem, code,
4748                                               first_vect, second_vect);
4749           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4750           first_vect = tem;
4751         }
4752
4753       new_phi_result = first_vect;
4754       if (new_vec_stmt)
4755         {
4756           new_phis.truncate (0);
4757           new_phis.safe_push (new_vec_stmt);
4758         }
4759     }
4760   /* Likewise if we couldn't use a single defuse cycle.  */
4761   else if (ncopies > 1)
4762     {
4763       gcc_assert (new_phis.length () == 1);
4764       tree first_vect = PHI_RESULT (new_phis[0]);
4765       gassign *new_vec_stmt = NULL;
4766       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4767       gimple *next_phi = new_phis[0];
4768       for (int k = 1; k < ncopies; ++k)
4769         {
4770           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4771           tree second_vect = PHI_RESULT (next_phi);
4772           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4773           new_vec_stmt = gimple_build_assign (tem, code,
4774                                               first_vect, second_vect);
4775           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4776           first_vect = tem;
4777         }
4778       new_phi_result = first_vect;
4779       new_phis.truncate (0);
4780       new_phis.safe_push (new_vec_stmt);
4781     }
4782   else
4783     new_phi_result = PHI_RESULT (new_phis[0]);
4784
4785   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4786       && reduc_fn != IFN_LAST)
4787     {
4788       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4789          various data values where the condition matched and another vector
4790          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4791          need to extract the last matching index (which will be the index with
4792          highest value) and use this to index into the data vector.
4793          For the case where there were no matches, the data vector will contain
4794          all default values and the index vector will be all zeros.  */
4795
4796       /* Get various versions of the type of the vector of indexes.  */
4797       tree index_vec_type = TREE_TYPE (induction_index);
4798       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4799       tree index_scalar_type = TREE_TYPE (index_vec_type);
4800       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4801         (index_vec_type);
4802
4803       /* Get an unsigned integer version of the type of the data vector.  */
4804       int scalar_precision
4805         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4806       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4807       tree vectype_unsigned = build_vector_type
4808         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4809
4810       /* First we need to create a vector (ZERO_VEC) of zeros and another
4811          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4812          can create using a MAX reduction and then expanding.
4813          In the case where the loop never made any matches, the max index will
4814          be zero.  */
4815
4816       /* Vector of {0, 0, 0,...}.  */
4817       tree zero_vec = make_ssa_name (vectype);
4818       tree zero_vec_rhs = build_zero_cst (vectype);
4819       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4820       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4821
4822       /* Find maximum value from the vector of found indexes.  */
4823       tree max_index = make_ssa_name (index_scalar_type);
4824       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4825                                                           1, induction_index);
4826       gimple_call_set_lhs (max_index_stmt, max_index);
4827       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4828
4829       /* Vector of {max_index, max_index, max_index,...}.  */
4830       tree max_index_vec = make_ssa_name (index_vec_type);
4831       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4832                                                       max_index);
4833       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4834                                                         max_index_vec_rhs);
4835       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4836
4837       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4838          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4839          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4840          otherwise.  Only one value should match, resulting in a vector
4841          (VEC_COND) with one data value and the rest zeros.
4842          In the case where the loop never made any matches, every index will
4843          match, resulting in a vector with all data values (which will all be
4844          the default value).  */
4845
4846       /* Compare the max index vector to the vector of found indexes to find
4847          the position of the max value.  */
4848       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4849       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4850                                                       induction_index,
4851                                                       max_index_vec);
4852       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4853
4854       /* Use the compare to choose either values from the data vector or
4855          zero.  */
4856       tree vec_cond = make_ssa_name (vectype);
4857       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4858                                                    vec_compare, new_phi_result,
4859                                                    zero_vec);
4860       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4861
4862       /* Finally we need to extract the data value from the vector (VEC_COND)
4863          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4864          reduction, but because this doesn't exist, we can use a MAX reduction
4865          instead.  The data value might be signed or a float so we need to cast
4866          it first.
4867          In the case where the loop never made any matches, the data values are
4868          all identical, and so will reduce down correctly.  */
4869
4870       /* Make the matched data values unsigned.  */
4871       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4872       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4873                                        vec_cond);
4874       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4875                                                         VIEW_CONVERT_EXPR,
4876                                                         vec_cond_cast_rhs);
4877       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4878
4879       /* Reduce down to a scalar value.  */
4880       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4881       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4882                                                            1, vec_cond_cast);
4883       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4884       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4885
4886       /* Convert the reduced value back to the result type and set as the
4887          result.  */
4888       gimple_seq stmts = NULL;
4889       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4890                                data_reduc);
4891       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4892       scalar_results.safe_push (new_temp);
4893     }
4894   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4895            && reduc_fn == IFN_LAST)
4896     {
4897       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4898          the scalar equivalent of:
4899          idx_val = induction_index[0];
4900          val = data_reduc[0];
4901          for (i = 1; i < nelts; ++i)
4902            if (induction_index[i] > idx_val)
4903              val = data_reduc[i], idx_val = induction_index[i];
4904          return val;  */
4905
4906       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4907       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4908       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4909       unsigned HOST_WIDE_INT v_size
4910         = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4911       tree idx_val = NULL_TREE, val = NULL_TREE;
4912       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4913         {
4914           tree old_idx_val = idx_val;
4915           tree old_val = val;
4916           idx_val = make_ssa_name (idx_eltype);
4917           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4918                                              build3 (BIT_FIELD_REF, idx_eltype,
4919                                                      induction_index,
4920                                                      bitsize_int (el_size),
4921                                                      bitsize_int (off)));
4922           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4923           val = make_ssa_name (data_eltype);
4924           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4925                                              build3 (BIT_FIELD_REF,
4926                                                      data_eltype,
4927                                                      new_phi_result,
4928                                                      bitsize_int (el_size),
4929                                                      bitsize_int (off)));
4930           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4931           if (off != 0)
4932             {
4933               tree new_idx_val = idx_val;
4934               tree new_val = val;
4935               if (off != v_size - el_size)
4936                 {
4937                   new_idx_val = make_ssa_name (idx_eltype);
4938                   epilog_stmt = gimple_build_assign (new_idx_val,
4939                                                      MAX_EXPR, idx_val,
4940                                                      old_idx_val);
4941                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4942                 }
4943               new_val = make_ssa_name (data_eltype);
4944               epilog_stmt = gimple_build_assign (new_val,
4945                                                  COND_EXPR,
4946                                                  build2 (GT_EXPR,
4947                                                          boolean_type_node,
4948                                                          idx_val,
4949                                                          old_idx_val),
4950                                                  val, old_val);
4951               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4952               idx_val = new_idx_val;
4953               val = new_val;
4954             }
4955         }
4956       /* Convert the reduced value back to the result type and set as the
4957          result.  */
4958       gimple_seq stmts = NULL;
4959       val = gimple_convert (&stmts, scalar_type, val);
4960       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4961       scalar_results.safe_push (val);
4962     }
4963
4964   /* 2.3 Create the reduction code, using one of the three schemes described
4965          above. In SLP we simply need to extract all the elements from the
4966          vector (without reducing them), so we use scalar shifts.  */
4967   else if (reduc_fn != IFN_LAST && !slp_reduc)
4968     {
4969       tree tmp;
4970       tree vec_elem_type;
4971
4972       /* Case 1:  Create:
4973          v_out2 = reduc_expr <v_out1>  */
4974
4975       if (dump_enabled_p ())
4976         dump_printf_loc (MSG_NOTE, vect_location,
4977                          "Reduce using direct vector reduction.\n");
4978
4979       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4980       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4981         {
4982           tree tmp_dest
4983             = vect_create_destination_var (scalar_dest, vec_elem_type);
4984           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4985                                                     new_phi_result);
4986           gimple_set_lhs (epilog_stmt, tmp_dest);
4987           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4988           gimple_set_lhs (epilog_stmt, new_temp);
4989           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4990
4991           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4992                                              new_temp);
4993         }
4994       else
4995         {
4996           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4997                                                     new_phi_result);
4998           gimple_set_lhs (epilog_stmt, new_scalar_dest);
4999         }
5000
5001       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5002       gimple_set_lhs (epilog_stmt, new_temp);
5003       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5004
5005       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5006            == INTEGER_INDUC_COND_REDUCTION)
5007           && !operand_equal_p (initial_def, induc_val, 0))
5008         {
5009           /* Earlier we set the initial value to be a vector of induc_val
5010              values.  Check the result and if it is induc_val then replace
5011              with the original initial value, unless induc_val is
5012              the same as initial_def already.  */
5013           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5014                                   induc_val);
5015
5016           tmp = make_ssa_name (new_scalar_dest);
5017           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5018                                              initial_def, new_temp);
5019           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5020           new_temp = tmp;
5021         }
5022
5023       scalar_results.safe_push (new_temp);
5024     }
5025   else
5026     {
5027       bool reduce_with_shift = have_whole_vector_shift (mode);
5028       int element_bitsize = tree_to_uhwi (bitsize);
5029       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5030       tree vec_temp;
5031
5032       /* COND reductions all do the final reduction with MAX_EXPR
5033          or MIN_EXPR.  */
5034       if (code == COND_EXPR)
5035         {
5036           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5037               == INTEGER_INDUC_COND_REDUCTION)
5038             code = induc_code;
5039           else
5040             code = MAX_EXPR;
5041         }
5042
5043       /* Regardless of whether we have a whole vector shift, if we're
5044          emulating the operation via tree-vect-generic, we don't want
5045          to use it.  Only the first round of the reduction is likely
5046          to still be profitable via emulation.  */
5047       /* ??? It might be better to emit a reduction tree code here, so that
5048          tree-vect-generic can expand the first round via bit tricks.  */
5049       if (!VECTOR_MODE_P (mode))
5050         reduce_with_shift = false;
5051       else
5052         {
5053           optab optab = optab_for_tree_code (code, vectype, optab_default);
5054           if (optab_handler (optab, mode) == CODE_FOR_nothing)
5055             reduce_with_shift = false;
5056         }
5057
5058       if (reduce_with_shift && !slp_reduc)
5059         {
5060           int nelements = vec_size_in_bits / element_bitsize;
5061           auto_vec_perm_indices sel (nelements);
5062
5063           int elt_offset;
5064
5065           tree zero_vec = build_zero_cst (vectype);
5066           /* Case 2: Create:
5067              for (offset = nelements/2; offset >= 1; offset/=2)
5068                 {
5069                   Create:  va' = vec_shift <va, offset>
5070                   Create:  va = vop <va, va'>
5071                 }  */
5072
5073           tree rhs;
5074
5075           if (dump_enabled_p ())
5076             dump_printf_loc (MSG_NOTE, vect_location,
5077                              "Reduce using vector shifts\n");
5078
5079           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5080           new_temp = new_phi_result;
5081           for (elt_offset = nelements / 2;
5082                elt_offset >= 1;
5083                elt_offset /= 2)
5084             {
5085               sel.truncate (0);
5086               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5087               tree mask = vect_gen_perm_mask_any (vectype, sel);
5088               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5089                                                  new_temp, zero_vec, mask);
5090               new_name = make_ssa_name (vec_dest, epilog_stmt);
5091               gimple_assign_set_lhs (epilog_stmt, new_name);
5092               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5093
5094               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5095                                                  new_temp);
5096               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5097               gimple_assign_set_lhs (epilog_stmt, new_temp);
5098               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5099             }
5100
5101           /* 2.4  Extract the final scalar result.  Create:
5102              s_out3 = extract_field <v_out2, bitpos>  */
5103
5104           if (dump_enabled_p ())
5105             dump_printf_loc (MSG_NOTE, vect_location,
5106                              "extract scalar result\n");
5107
5108           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5109                         bitsize, bitsize_zero_node);
5110           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5111           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5112           gimple_assign_set_lhs (epilog_stmt, new_temp);
5113           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5114           scalar_results.safe_push (new_temp);
5115         }
5116       else
5117         {
5118           /* Case 3: Create:
5119              s = extract_field <v_out2, 0>
5120              for (offset = element_size;
5121                   offset < vector_size;
5122                   offset += element_size;)
5123                {
5124                  Create:  s' = extract_field <v_out2, offset>
5125                  Create:  s = op <s, s'>  // For non SLP cases
5126                }  */
5127
5128           if (dump_enabled_p ())
5129             dump_printf_loc (MSG_NOTE, vect_location,
5130                              "Reduce using scalar code.\n");
5131
5132           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5133           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5134             {
5135               int bit_offset;
5136               if (gimple_code (new_phi) == GIMPLE_PHI)
5137                 vec_temp = PHI_RESULT (new_phi);
5138               else
5139                 vec_temp = gimple_assign_lhs (new_phi);
5140               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5141                                  bitsize_zero_node);
5142               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5143               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5144               gimple_assign_set_lhs (epilog_stmt, new_temp);
5145               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5146
5147               /* In SLP we don't need to apply reduction operation, so we just
5148                  collect s' values in SCALAR_RESULTS.  */
5149               if (slp_reduc)
5150                 scalar_results.safe_push (new_temp);
5151
5152               for (bit_offset = element_bitsize;
5153                    bit_offset < vec_size_in_bits;
5154                    bit_offset += element_bitsize)
5155                 {
5156                   tree bitpos = bitsize_int (bit_offset);
5157                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5158                                      bitsize, bitpos);
5159
5160                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5161                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5162                   gimple_assign_set_lhs (epilog_stmt, new_name);
5163                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5164
5165                   if (slp_reduc)
5166                     {
5167                       /* In SLP we don't need to apply reduction operation, so
5168                          we just collect s' values in SCALAR_RESULTS.  */
5169                       new_temp = new_name;
5170                       scalar_results.safe_push (new_name);
5171                     }
5172                   else
5173                     {
5174                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5175                                                          new_name, new_temp);
5176                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5177                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5178                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5179                     }
5180                 }
5181             }
5182
5183           /* The only case where we need to reduce scalar results in SLP, is
5184              unrolling.  If the size of SCALAR_RESULTS is greater than
5185              GROUP_SIZE, we reduce them combining elements modulo
5186              GROUP_SIZE.  */
5187           if (slp_reduc)
5188             {
5189               tree res, first_res, new_res;
5190               gimple *new_stmt;
5191
5192               /* Reduce multiple scalar results in case of SLP unrolling.  */
5193               for (j = group_size; scalar_results.iterate (j, &res);
5194                    j++)
5195                 {
5196                   first_res = scalar_results[j % group_size];
5197                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5198                                                   first_res, res);
5199                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5200                   gimple_assign_set_lhs (new_stmt, new_res);
5201                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5202                   scalar_results[j % group_size] = new_res;
5203                 }
5204             }
5205           else
5206             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5207             scalar_results.safe_push (new_temp);
5208         }
5209
5210       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5211            == INTEGER_INDUC_COND_REDUCTION)
5212           && !operand_equal_p (initial_def, induc_val, 0))
5213         {
5214           /* Earlier we set the initial value to be a vector of induc_val
5215              values.  Check the result and if it is induc_val then replace
5216              with the original initial value, unless induc_val is
5217              the same as initial_def already.  */
5218           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5219                                   induc_val);
5220
5221           tree tmp = make_ssa_name (new_scalar_dest);
5222           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5223                                              initial_def, new_temp);
5224           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5225           scalar_results[0] = tmp;
5226         }
5227     }
5228
5229 vect_finalize_reduction:
5230
5231   if (double_reduc)
5232     loop = loop->inner;
5233
5234   /* 2.5 Adjust the final result by the initial value of the reduction
5235          variable. (When such adjustment is not needed, then
5236          'adjustment_def' is zero).  For example, if code is PLUS we create:
5237          new_temp = loop_exit_def + adjustment_def  */
5238
5239   if (adjustment_def)
5240     {
5241       gcc_assert (!slp_reduc);
5242       if (nested_in_vect_loop)
5243         {
5244           new_phi = new_phis[0];
5245           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5246           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5247           new_dest = vect_create_destination_var (scalar_dest, vectype);
5248         }
5249       else
5250         {
5251           new_temp = scalar_results[0];
5252           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5253           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5254           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5255         }
5256
5257       epilog_stmt = gimple_build_assign (new_dest, expr);
5258       new_temp = make_ssa_name (new_dest, epilog_stmt);
5259       gimple_assign_set_lhs (epilog_stmt, new_temp);
5260       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5261       if (nested_in_vect_loop)
5262         {
5263           set_vinfo_for_stmt (epilog_stmt,
5264                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5265           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5266                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5267
5268           if (!double_reduc)
5269             scalar_results.quick_push (new_temp);
5270           else
5271             scalar_results[0] = new_temp;
5272         }
5273       else
5274         scalar_results[0] = new_temp;
5275
5276       new_phis[0] = epilog_stmt;
5277     }
5278
5279   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5280           phis with new adjusted scalar results, i.e., replace use <s_out0>
5281           with use <s_out4>.
5282
5283      Transform:
5284         loop_exit:
5285           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5286           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5287           v_out2 = reduce <v_out1>
5288           s_out3 = extract_field <v_out2, 0>
5289           s_out4 = adjust_result <s_out3>
5290           use <s_out0>
5291           use <s_out0>
5292
5293      into:
5294
5295         loop_exit:
5296           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5297           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5298           v_out2 = reduce <v_out1>
5299           s_out3 = extract_field <v_out2, 0>
5300           s_out4 = adjust_result <s_out3>
5301           use <s_out4>
5302           use <s_out4> */
5303
5304
5305   /* In SLP reduction chain we reduce vector results into one vector if
5306      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5307      the last stmt in the reduction chain, since we are looking for the loop
5308      exit phi node.  */
5309   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5310     {
5311       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5312       /* Handle reduction patterns.  */
5313       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5314         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5315
5316       scalar_dest = gimple_assign_lhs (dest_stmt);
5317       group_size = 1;
5318     }
5319
5320   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5321      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5322      need to match SCALAR_RESULTS with corresponding statements.  The first
5323      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5324      the first vector stmt, etc.
5325      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5326   if (group_size > new_phis.length ())
5327     {
5328       ratio = group_size / new_phis.length ();
5329       gcc_assert (!(group_size % new_phis.length ()));
5330     }
5331   else
5332     ratio = 1;
5333
5334   for (k = 0; k < group_size; k++)
5335     {
5336       if (k % ratio == 0)
5337         {
5338           epilog_stmt = new_phis[k / ratio];
5339           reduction_phi = reduction_phis[k / ratio];
5340           if (double_reduc)
5341             inner_phi = inner_phis[k / ratio];
5342         }
5343
5344       if (slp_reduc)
5345         {
5346           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5347
5348           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5349           /* SLP statements can't participate in patterns.  */
5350           gcc_assert (!orig_stmt);
5351           scalar_dest = gimple_assign_lhs (current_stmt);
5352         }
5353
5354       phis.create (3);
5355       /* Find the loop-closed-use at the loop exit of the original scalar
5356          result.  (The reduction result is expected to have two immediate uses -
5357          one at the latch block, and one at the loop exit).  */
5358       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5359         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5360             && !is_gimple_debug (USE_STMT (use_p)))
5361           phis.safe_push (USE_STMT (use_p));
5362
5363       /* While we expect to have found an exit_phi because of loop-closed-ssa
5364          form we can end up without one if the scalar cycle is dead.  */
5365
5366       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5367         {
5368           if (outer_loop)
5369             {
5370               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5371               gphi *vect_phi;
5372
5373               /* FORNOW. Currently not supporting the case that an inner-loop
5374                  reduction is not used in the outer-loop (but only outside the
5375                  outer-loop), unless it is double reduction.  */
5376               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5377                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5378                           || double_reduc);
5379
5380               if (double_reduc)
5381                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5382               else
5383                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5384               if (!double_reduc
5385                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5386                       != vect_double_reduction_def)
5387                 continue;
5388
5389               /* Handle double reduction:
5390
5391                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5392                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5393                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5394                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5395
5396                  At that point the regular reduction (stmt2 and stmt3) is
5397                  already vectorized, as well as the exit phi node, stmt4.
5398                  Here we vectorize the phi node of double reduction, stmt1, and
5399                  update all relevant statements.  */
5400
5401               /* Go through all the uses of s2 to find double reduction phi
5402                  node, i.e., stmt1 above.  */
5403               orig_name = PHI_RESULT (exit_phi);
5404               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5405                 {
5406                   stmt_vec_info use_stmt_vinfo;
5407                   stmt_vec_info new_phi_vinfo;
5408                   tree vect_phi_init, preheader_arg, vect_phi_res;
5409                   basic_block bb = gimple_bb (use_stmt);
5410                   gimple *use;
5411
5412                   /* Check that USE_STMT is really double reduction phi
5413                      node.  */
5414                   if (gimple_code (use_stmt) != GIMPLE_PHI
5415                       || gimple_phi_num_args (use_stmt) != 2
5416                       || bb->loop_father != outer_loop)
5417                     continue;
5418                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5419                   if (!use_stmt_vinfo
5420                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5421                           != vect_double_reduction_def)
5422                     continue;
5423
5424                   /* Create vector phi node for double reduction:
5425                      vs1 = phi <vs0, vs2>
5426                      vs1 was created previously in this function by a call to
5427                        vect_get_vec_def_for_operand and is stored in
5428                        vec_initial_def;
5429                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5430                      vs0 is created here.  */
5431
5432                   /* Create vector phi node.  */
5433                   vect_phi = create_phi_node (vec_initial_def, bb);
5434                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5435                                     loop_vec_info_for_loop (outer_loop));
5436                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5437
5438                   /* Create vs0 - initial def of the double reduction phi.  */
5439                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5440                                              loop_preheader_edge (outer_loop));
5441                   vect_phi_init = get_initial_def_for_reduction
5442                     (stmt, preheader_arg, NULL);
5443
5444                   /* Update phi node arguments with vs0 and vs2.  */
5445                   add_phi_arg (vect_phi, vect_phi_init,
5446                                loop_preheader_edge (outer_loop),
5447                                UNKNOWN_LOCATION);
5448                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5449                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5450                   if (dump_enabled_p ())
5451                     {
5452                       dump_printf_loc (MSG_NOTE, vect_location,
5453                                        "created double reduction phi node: ");
5454                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5455                     }
5456
5457                   vect_phi_res = PHI_RESULT (vect_phi);
5458
5459                   /* Replace the use, i.e., set the correct vs1 in the regular
5460                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5461                      loop is redundant.  */
5462                   use = reduction_phi;
5463                   for (j = 0; j < ncopies; j++)
5464                     {
5465                       edge pr_edge = loop_preheader_edge (loop);
5466                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5467                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5468                     }
5469                 }
5470             }
5471         }
5472
5473       phis.release ();
5474       if (nested_in_vect_loop)
5475         {
5476           if (double_reduc)
5477             loop = outer_loop;
5478           else
5479             continue;
5480         }
5481
5482       phis.create (3);
5483       /* Find the loop-closed-use at the loop exit of the original scalar
5484          result.  (The reduction result is expected to have two immediate uses,
5485          one at the latch block, and one at the loop exit).  For double
5486          reductions we are looking for exit phis of the outer loop.  */
5487       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5488         {
5489           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5490             {
5491               if (!is_gimple_debug (USE_STMT (use_p)))
5492                 phis.safe_push (USE_STMT (use_p));
5493             }
5494           else
5495             {
5496               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5497                 {
5498                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5499
5500                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5501                     {
5502                       if (!flow_bb_inside_loop_p (loop,
5503                                              gimple_bb (USE_STMT (phi_use_p)))
5504                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5505                         phis.safe_push (USE_STMT (phi_use_p));
5506                     }
5507                 }
5508             }
5509         }
5510
5511       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5512         {
5513           /* Replace the uses:  */
5514           orig_name = PHI_RESULT (exit_phi);
5515           scalar_result = scalar_results[k];
5516           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5517             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5518               SET_USE (use_p, scalar_result);
5519         }
5520
5521       phis.release ();
5522     }
5523 }
5524
5525
5526 /* Function is_nonwrapping_integer_induction.
5527
5528    Return true if the induction PHI STMT of LOOP has an INTEGER_CST base and
5529    step and is known not to wrap.  The sign of STEP is not checked.  */
5530
5531 static bool
5532 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5533 {
5534   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5535   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5536   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5537   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5538   widest_int ni, max_loop_value;
5539   bool overflow = false;
5540
5541   /* Only a constant integer base and step can be analyzed here.  */
5542   if (TREE_CODE (base) != INTEGER_CST
5543       || TREE_CODE (step) != INTEGER_CST)
5544     return false;
5545
5546   /* If overflow is undefined for the type of the PHI result, the
5547      induction can be assumed not to wrap.  */
5548   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5549     return true;
5550   /* Otherwise we need an upper bound NI on the number of executions.  */
5551   if (! max_stmt_executions (loop, &ni))
5552     return false;
5553   /* Compute BASE + STEP * NI in widest_int; give up if that overflows.  */
5554   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5555                             &overflow);
5556   if (overflow)
5557     return false;
5558
5559   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5560                             TYPE_SIGN (lhs_type), &overflow);
5561   if (overflow)
5562     return false;
5563   /* Accept only if that value is representable in the type of the PHI.  */
5564   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5565           <= TYPE_PRECISION (lhs_type));
5566 }
5567
5568 /* Function vectorizable_reduction.
5569
5570    Check if STMT performs a reduction operation that can be vectorized.
5571    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5572    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5573    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5574
5575    This function also handles reduction idioms (patterns) that have been
5576    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5577    of this form:
5578      X = pattern_expr (arg0, arg1, ..., X)
5579    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5580    sequence that had been detected and replaced by the pattern-stmt (STMT).
5581
5582    This function also handles reduction of condition expressions, for example:
5583      for (int i = 0; i < N; i++)
5584        if (a[i] < value)
5585          last = a[i];
5586    This is handled by vectorising the loop and creating an additional vector
5587    containing the loop indexes for which "a[i] < value" was true.  In the
5588    function epilogue this is reduced to a single max value and then used to
5589    index into the vector of results.
5590
5591    In some cases of reduction patterns, the type of the reduction variable X is
5592    different than the type of the other arguments of STMT.
5593    In such cases, the vectype that is used when transforming STMT into a vector
5594    stmt is different than the vectype that is used to determine the
5595    vectorization factor, because it consists of a different number of elements
5596    than the actual number of elements that are being operated upon in parallel.
5597
5598    For example, consider an accumulation of shorts into an int accumulator.
5599    On some targets it's possible to vectorize this pattern operating on 8
5600    shorts at a time (hence, the vectype for purposes of determining the
5601    vectorization factor should be V8HI); on the other hand, the vectype that
5602    is used to create the vector form is actually V4SI (the type of the result).
5603
5604    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5605    indicates what is the actual level of parallelism (V8HI in the example), so
5606    that the right vectorization factor would be derived.  This vectype
5607    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5608    be used to create the vectorized stmt.  The right vectype for the vectorized
5609    stmt is obtained from the type of the result X:
5610         get_vectype_for_scalar_type (TREE_TYPE (X))
5611
5612    This means that, contrary to "regular" reductions (or "regular" stmts in
5613    general), the following equation:
5614       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5615    does *NOT* necessarily hold for reduction patterns.  */
5616
5617 bool
5618 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5619                         gimple **vec_stmt, slp_tree slp_node,
5620                         slp_instance slp_node_instance)
5621 {
5622   tree vec_dest;
5623   tree scalar_dest;
5624   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5625   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5626   tree vectype_in = NULL_TREE;
5627   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5628   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5629   enum tree_code code, orig_code;
5630   internal_fn reduc_fn;
5631   machine_mode vec_mode;
5632   int op_type;
5633   optab optab;
5634   tree new_temp = NULL_TREE;
5635   gimple *def_stmt;
5636   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5637   gimple *cond_reduc_def_stmt = NULL;
5638   enum tree_code cond_reduc_op_code = ERROR_MARK;
5639   tree scalar_type;
5640   bool is_simple_use;
5641   gimple *orig_stmt;
5642   stmt_vec_info orig_stmt_info = NULL;
5643   int i;
5644   int ncopies;
5645   int epilog_copies;
5646   stmt_vec_info prev_stmt_info, prev_phi_info;
5647   bool single_defuse_cycle = false;
5648   gimple *new_stmt = NULL;
5649   int j;
5650   tree ops[3];
5651   enum vect_def_type dts[3];
5652   bool nested_cycle = false, found_nested_cycle_def = false;
5653   bool double_reduc = false;
5654   basic_block def_bb;
5655   struct loop * def_stmt_loop, *outer_loop = NULL;
5656   tree def_arg;
5657   gimple *def_arg_stmt;
5658   auto_vec<tree> vec_oprnds0;
5659   auto_vec<tree> vec_oprnds1;
5660   auto_vec<tree> vec_oprnds2;
5661   auto_vec<tree> vect_defs;
5662   auto_vec<gimple *> phis;
5663   int vec_num;
5664   tree def0, tem;
5665   bool first_p = true;
5666   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5667   tree cond_reduc_val = NULL_TREE;
5668
5669   /* Make sure it was already recognized as a reduction computation.  */
5670   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5671       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5672     return false;
5673
5674   if (nested_in_vect_loop_p (loop, stmt))
5675     {
5676       outer_loop = loop;
5677       loop = loop->inner;
5678       nested_cycle = true;
5679     }
5680
5681   /* In case of reduction chain we switch to the first stmt in the chain, but
5682      we don't update STMT_INFO, since only the last stmt is marked as reduction
5683      and has reduction properties.  */
5684   if (GROUP_FIRST_ELEMENT (stmt_info)
5685       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5686     {
5687       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5688       first_p = false;
5689     }
5690
5691   if (gimple_code (stmt) == GIMPLE_PHI)
5692     {
5693       /* Analysis is fully done on the reduction stmt invocation.  */
5694       if (! vec_stmt)
5695         {
5696           if (slp_node)
5697             slp_node_instance->reduc_phis = slp_node;
5698
5699           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5700           return true;
5701         }
5702
5703       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5704       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5705         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5706
5707       gcc_assert (is_gimple_assign (reduc_stmt));
5708       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5709         {
5710           tree op = gimple_op (reduc_stmt, k);
5711           if (op == gimple_phi_result (stmt))
5712             continue;
5713           if (k == 1
5714               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5715             continue;
5716           tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5717           if (! vectype_in
5718               || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5719             vectype_in = tem;
5720           break;
5721         }
5722       gcc_assert (vectype_in);
5723
5724       if (slp_node)
5725         ncopies = 1;
5726       else
5727         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5728
5729       use_operand_p use_p;
5730       gimple *use_stmt;
5731       if (ncopies > 1
5732           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5733               <= vect_used_only_live)
5734           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5735           && (use_stmt == reduc_stmt
5736               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5737                   == reduc_stmt)))
5738         single_defuse_cycle = true;
5739
5740       /* Create the destination vector  */
5741       scalar_dest = gimple_assign_lhs (reduc_stmt);
5742       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5743
5744       if (slp_node)
5745         /* The size vect_schedule_slp_instance computes is off for us.  */
5746         vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5747                     * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5748                    / TYPE_VECTOR_SUBPARTS (vectype_in));
5749       else
5750         vec_num = 1;
5751
5752       /* Generate the reduction PHIs upfront.  */
5753       prev_phi_info = NULL;
5754       for (j = 0; j < ncopies; j++)
5755         {
5756           if (j == 0 || !single_defuse_cycle)
5757             {
5758               for (i = 0; i < vec_num; i++)
5759                 {
5760                   /* Create the reduction-phi that defines the reduction
5761                      operand.  */
5762                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
5763                   set_vinfo_for_stmt (new_phi,
5764                                       new_stmt_vec_info (new_phi, loop_vinfo));
5765
5766                   if (slp_node)
5767                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5768                   else
5769                     {
5770                       if (j == 0)
5771                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5772                       else
5773                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5774                       prev_phi_info = vinfo_for_stmt (new_phi);
5775                     }
5776                 }
5777             }
5778         }
5779
5780       return true;
5781     }
5782
5783   /* 1. Is vectorizable reduction?  */
5784   /* Not supportable if the reduction variable is used in the loop, unless
5785      it's a reduction chain.  */
5786   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5787       && !GROUP_FIRST_ELEMENT (stmt_info))
5788     return false;
5789
5790   /* Reductions that are not used even in an enclosing outer-loop,
5791      are expected to be "live" (used out of the loop).  */
5792   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5793       && !STMT_VINFO_LIVE_P (stmt_info))
5794     return false;
5795
5796   /* 2. Has this been recognized as a reduction pattern?
5797
5798      Check if STMT represents a pattern that has been recognized
5799      in earlier analysis stages.  For stmts that represent a pattern,
5800      the STMT_VINFO_RELATED_STMT field records the last stmt in
5801      the original sequence that constitutes the pattern.  */
5802
5803   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5804   if (orig_stmt)
5805     {
5806       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5807       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5808       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5809     }
5810
5811   /* 3. Check the operands of the operation.  The first operands are defined
5812         inside the loop body. The last operand is the reduction variable,
5813         which is defined by the loop-header-phi.  */
5814
5815   gcc_assert (is_gimple_assign (stmt));
5816
5817   /* Flatten RHS.  */
5818   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5819     {
5820     case GIMPLE_BINARY_RHS:
5821       code = gimple_assign_rhs_code (stmt);
5822       op_type = TREE_CODE_LENGTH (code);
5823       gcc_assert (op_type == binary_op);
5824       ops[0] = gimple_assign_rhs1 (stmt);
5825       ops[1] = gimple_assign_rhs2 (stmt);
5826       break;
5827
5828     case GIMPLE_TERNARY_RHS:
5829       code = gimple_assign_rhs_code (stmt);
5830       op_type = TREE_CODE_LENGTH (code);
5831       gcc_assert (op_type == ternary_op);
5832       ops[0] = gimple_assign_rhs1 (stmt);
5833       ops[1] = gimple_assign_rhs2 (stmt);
5834       ops[2] = gimple_assign_rhs3 (stmt);
5835       break;
5836
5837     case GIMPLE_UNARY_RHS:
5838       return false;
5839
5840     default:
5841       gcc_unreachable ();
5842     }
5843
5844   if (code == COND_EXPR && slp_node)
5845     return false;
5846
5847   scalar_dest = gimple_assign_lhs (stmt);
5848   scalar_type = TREE_TYPE (scalar_dest);
5849   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5850       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5851     return false;
5852
5853   /* Do not try to vectorize bit-precision reductions.  */
5854   if (!type_has_mode_precision_p (scalar_type))
5855     return false;
5856
5857   /* All uses but the last are expected to be defined in the loop.
5858      The last use is the reduction variable.  In case of nested cycle this
5859      assumption is not true: we use reduc_index to record the index of the
5860      reduction variable.  */
5861   gimple *reduc_def_stmt = NULL;
5862   int reduc_index = -1;
5863   for (i = 0; i < op_type; i++)
5864     {
5865       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5866       if (i == 0 && code == COND_EXPR)
5867         continue;
5868
5869       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5870                                           &def_stmt, &dts[i], &tem);
5871       dt = dts[i];
5872       gcc_assert (is_simple_use);
5873       if (dt == vect_reduction_def)
5874         {
5875           reduc_def_stmt = def_stmt;
5876           reduc_index = i;
5877           continue;
5878         }
5879       else if (tem)
5880         {
5881           /* To properly compute ncopies we are interested in the widest
5882              input type in case we're looking at a widening accumulation.  */
5883           if (!vectype_in
5884               || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem))
5885             vectype_in = tem;
5886         }
5887
5888       if (dt != vect_internal_def
5889           && dt != vect_external_def
5890           && dt != vect_constant_def
5891           && dt != vect_induction_def
5892           && !(dt == vect_nested_cycle && nested_cycle))
5893         return false;
5894
5895       if (dt == vect_nested_cycle)
5896         {
5897           found_nested_cycle_def = true;
5898           reduc_def_stmt = def_stmt;
5899           reduc_index = i;
5900         }
5901
5902       if (i == 1 && code == COND_EXPR)
5903         {
5904           /* Record how value of COND_EXPR is defined.  */
5905           if (dt == vect_constant_def)
5906             {
5907               cond_reduc_dt = dt;
5908               cond_reduc_val = ops[i];
5909             }
5910           if (dt == vect_induction_def
5911               && def_stmt != NULL
5912               && is_nonwrapping_integer_induction (def_stmt, loop))
5913             {
5914               cond_reduc_dt = dt;
5915               cond_reduc_def_stmt = def_stmt;
5916             }
5917         }
5918     }
5919
5920   if (!vectype_in)
5921     vectype_in = vectype_out;
5922
5923   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5924      directly used in stmt.  */
5925   if (reduc_index == -1)
5926     {
5927       if (orig_stmt)
5928         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5929       else
5930         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5931     }
5932
5933   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5934     return false;
5935
5936   if (!(reduc_index == -1
5937         || dts[reduc_index] == vect_reduction_def
5938         || dts[reduc_index] == vect_nested_cycle
5939         || ((dts[reduc_index] == vect_internal_def
5940              || dts[reduc_index] == vect_external_def
5941              || dts[reduc_index] == vect_constant_def
5942              || dts[reduc_index] == vect_induction_def)
5943             && nested_cycle && found_nested_cycle_def)))
5944     {
5945       /* For pattern recognized stmts, orig_stmt might be a reduction,
5946          but some helper statements for the pattern might not, or
5947          might be COND_EXPRs with reduction uses in the condition.  */
5948       gcc_assert (orig_stmt);
5949       return false;
5950     }
5951
5952   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5953   enum vect_reduction_type v_reduc_type
5954     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5955   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5956
5957   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5958   /* If we have a condition reduction, see if we can simplify it further.  */
5959   if (v_reduc_type == COND_REDUCTION)
5960     {
5961       if (cond_reduc_dt == vect_induction_def)
5962         {
5963           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
5964           tree base
5965             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
5966           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
5967
5968           gcc_assert (TREE_CODE (base) == INTEGER_CST
5969                       && TREE_CODE (step) == INTEGER_CST);
5970           cond_reduc_val = NULL_TREE;
5971           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
5972              above base; punt if base is the minimum value of the type for
5973              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
5974           if (tree_int_cst_sgn (step) == -1)
5975             {
5976               cond_reduc_op_code = MIN_EXPR;
5977               if (tree_int_cst_sgn (base) == -1)
5978                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5979               else if (tree_int_cst_lt (base,
5980                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
5981                 cond_reduc_val
5982                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
5983             }
5984           else
5985             {
5986               cond_reduc_op_code = MAX_EXPR;
5987               if (tree_int_cst_sgn (base) == 1)
5988                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5989               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
5990                                         base))
5991                 cond_reduc_val
5992                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
5993             }
5994           if (cond_reduc_val)
5995             {
5996               if (dump_enabled_p ())
5997                 dump_printf_loc (MSG_NOTE, vect_location,
5998                                  "condition expression based on "
5999                                  "integer induction.\n");
6000               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6001                 = INTEGER_INDUC_COND_REDUCTION;
6002             }
6003         }
6004
6005       /* Loop peeling modifies initial value of reduction PHI, which
6006          makes the reduction stmt to be transformed different to the
6007          original stmt analyzed.  We need to record reduction code for
6008          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6009          it can be used directly at transform stage.  */
6010       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6011           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6012         {
6013           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6014           gcc_assert (cond_reduc_dt == vect_constant_def);
6015           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6016         }
6017       else if (cond_reduc_dt == vect_constant_def)
6018         {
6019           enum vect_def_type cond_initial_dt;
6020           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6021           tree cond_initial_val
6022             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6023
6024           gcc_assert (cond_reduc_val != NULL_TREE);
6025           vect_is_simple_use (cond_initial_val, loop_vinfo,
6026                               &def_stmt, &cond_initial_dt);
6027           if (cond_initial_dt == vect_constant_def
6028               && types_compatible_p (TREE_TYPE (cond_initial_val),
6029                                      TREE_TYPE (cond_reduc_val)))
6030             {
6031               tree e = fold_binary (LE_EXPR, boolean_type_node,
6032                                     cond_initial_val, cond_reduc_val);
6033               if (e && (integer_onep (e) || integer_zerop (e)))
6034                 {
6035                   if (dump_enabled_p ())
6036                     dump_printf_loc (MSG_NOTE, vect_location,
6037                                      "condition expression based on "
6038                                      "compile time constant.\n");
6039                   /* Record reduction code at analysis stage.  */
6040                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6041                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6042                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6043                     = CONST_COND_REDUCTION;
6044                 }
6045             }
6046         }
6047     }
6048
6049   if (orig_stmt)
6050     gcc_assert (tmp == orig_stmt
6051                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6052   else
6053     /* We changed STMT to be the first stmt in reduction chain, hence we
6054        check that in this case the first element in the chain is STMT.  */
6055     gcc_assert (stmt == tmp
6056                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6057
6058   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6059     return false;
6060
6061   if (slp_node)
6062     ncopies = 1;
6063   else
6064     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6065
6066   gcc_assert (ncopies >= 1);
6067
6068   vec_mode = TYPE_MODE (vectype_in);
6069
6070   if (code == COND_EXPR)
6071     {
6072       /* Only call during the analysis stage, otherwise we'll lose
6073          STMT_VINFO_TYPE.  */
6074       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6075                                                 ops[reduc_index], 0, NULL))
6076         {
6077           if (dump_enabled_p ())
6078             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6079                              "unsupported condition in reduction\n");
6080           return false;
6081         }
6082     }
6083   else
6084     {
6085       /* 4. Supportable by target?  */
6086
6087       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6088           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6089         {
6090           /* Shifts and rotates are only supported by vectorizable_shifts,
6091              not vectorizable_reduction.  */
6092           if (dump_enabled_p ())
6093             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6094                              "unsupported shift or rotation.\n");
6095           return false;
6096         }
6097
6098       /* 4.1. check support for the operation in the loop  */
6099       optab = optab_for_tree_code (code, vectype_in, optab_default);
6100       if (!optab)
6101         {
6102           if (dump_enabled_p ())
6103             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6104                              "no optab.\n");
6105
6106           return false;
6107         }
6108
6109       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6110         {
6111           if (dump_enabled_p ())
6112             dump_printf (MSG_NOTE, "op not supported by target.\n");
6113
6114           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6115               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6116             return false;
6117
6118           if (dump_enabled_p ())
6119             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6120         }
6121
6122       /* Worthwhile without SIMD support?  */
6123       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6124           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6125         {
6126           if (dump_enabled_p ())
6127             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6128                              "not worthwhile without SIMD support.\n");
6129
6130           return false;
6131         }
6132     }
6133
6134   /* 4.2. Check support for the epilog operation.
6135
6136           If STMT represents a reduction pattern, then the type of the
6137           reduction variable may be different than the type of the rest
6138           of the arguments.  For example, consider the case of accumulation
6139           of shorts into an int accumulator; The original code:
6140                         S1: int_a = (int) short_a;
6141           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6142
6143           was replaced with:
6144                         STMT: int_acc = widen_sum <short_a, int_acc>
6145
6146           This means that:
6147           1. The tree-code that is used to create the vector operation in the
6148              epilog code (that reduces the partial results) is not the
6149              tree-code of STMT, but is rather the tree-code of the original
6150              stmt from the pattern that STMT is replacing.  I.e, in the example
6151              above we want to use 'widen_sum' in the loop, but 'plus' in the
6152              epilog.
6153           2. The type (mode) we use to check available target support
6154              for the vector operation to be created in the *epilog*, is
6155              determined by the type of the reduction variable (in the example
6156              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6157              However the type (mode) we use to check available target support
6158              for the vector operation to be created *inside the loop*, is
6159              determined by the type of the other arguments to STMT (in the
6160              example we'd check this: optab_handler (widen_sum_optab,
6161              vect_short_mode)).
6162
6163           This is contrary to "regular" reductions, in which the types of all
6164           the arguments are the same as the type of the reduction variable.
6165           For "regular" reductions we can therefore use the same vector type
6166           (and also the same tree-code) when generating the epilog code and
6167           when generating the code inside the loop.  */
6168
6169   if (orig_stmt)
6170     {
6171       /* This is a reduction pattern: get the vectype from the type of the
6172          reduction variable, and get the tree-code from orig_stmt.  */
6173       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6174                   == TREE_CODE_REDUCTION);
6175       orig_code = gimple_assign_rhs_code (orig_stmt);
6176       gcc_assert (vectype_out);
6177       vec_mode = TYPE_MODE (vectype_out);
6178     }
6179   else
6180     {
6181       /* Regular reduction: use the same vectype and tree-code as used for
6182          the vector code inside the loop can be used for the epilog code. */
6183       orig_code = code;
6184
6185       if (code == MINUS_EXPR)
6186         orig_code = PLUS_EXPR;
6187
6188       /* For simple condition reductions, replace with the actual expression
6189          we want to base our reduction around.  */
6190       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6191         {
6192           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6193           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6194         }
6195       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6196                == INTEGER_INDUC_COND_REDUCTION)
6197         orig_code = cond_reduc_op_code;
6198     }
6199
6200   if (nested_cycle)
6201     {
6202       def_bb = gimple_bb (reduc_def_stmt);
6203       def_stmt_loop = def_bb->loop_father;
6204       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6205                                        loop_preheader_edge (def_stmt_loop));
6206       if (TREE_CODE (def_arg) == SSA_NAME
6207           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6208           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6209           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6210           && vinfo_for_stmt (def_arg_stmt)
6211           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6212               == vect_double_reduction_def)
6213         double_reduc = true;
6214     }
6215
6216   reduc_fn = IFN_LAST;
6217
6218   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6219     {
6220       if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6221         {
6222           if (reduc_fn != IFN_LAST
6223               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6224                                                   OPTIMIZE_FOR_SPEED))
6225             {
6226               if (dump_enabled_p ())
6227                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6228                                  "reduc op not supported by target.\n");
6229
6230               reduc_fn = IFN_LAST;
6231             }
6232         }
6233       else
6234         {
6235           if (!nested_cycle || double_reduc)
6236             {
6237               if (dump_enabled_p ())
6238                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6239                                  "no reduc code for scalar code.\n");
6240
6241               return false;
6242             }
6243         }
6244     }
6245   else
6246     {
6247       int scalar_precision
6248         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6249       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6250       cr_index_vector_type = build_vector_type
6251         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6252
6253       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6254                                           OPTIMIZE_FOR_SPEED))
6255         reduc_fn = IFN_REDUC_MAX;
6256     }
6257
6258   if ((double_reduc
6259        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6260       && ncopies > 1)
6261     {
6262       if (dump_enabled_p ())
6263         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6264                          "multiple types in double reduction or condition "
6265                          "reduction.\n");
6266       return false;
6267     }
6268
6269   /* In case of widenning multiplication by a constant, we update the type
6270      of the constant to be the type of the other operand.  We check that the
6271      constant fits the type in the pattern recognition pass.  */
6272   if (code == DOT_PROD_EXPR
6273       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6274     {
6275       if (TREE_CODE (ops[0]) == INTEGER_CST)
6276         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6277       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6278         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6279       else
6280         {
6281           if (dump_enabled_p ())
6282             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6283                              "invalid types in dot-prod\n");
6284
6285           return false;
6286         }
6287     }
6288
6289   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6290     {
6291       widest_int ni;
6292
6293       if (! max_loop_iterations (loop, &ni))
6294         {
6295           if (dump_enabled_p ())
6296             dump_printf_loc (MSG_NOTE, vect_location,
6297                              "loop count not known, cannot create cond "
6298                              "reduction.\n");
6299           return false;
6300         }
6301       /* Convert backedges to iterations.  */
6302       ni += 1;
6303
6304       /* The additional index will be the same type as the condition.  Check
6305          that the loop can fit into this less one (because we'll use up the
6306          zero slot for when there are no matches).  */
6307       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6308       if (wi::geu_p (ni, wi::to_widest (max_index)))
6309         {
6310           if (dump_enabled_p ())
6311             dump_printf_loc (MSG_NOTE, vect_location,
6312                              "loop size is greater than data size.\n");
6313           return false;
6314         }
6315     }
6316
6317   /* In case the vectorization factor (VF) is bigger than the number
6318      of elements that we can fit in a vectype (nunits), we have to generate
6319      more than one vector stmt - i.e - we need to "unroll" the
6320      vector stmt by a factor VF/nunits.  For more details see documentation
6321      in vectorizable_operation.  */
6322
6323   /* If the reduction is used in an outer loop we need to generate
6324      VF intermediate results, like so (e.g. for ncopies=2):
6325         r0 = phi (init, r0)
6326         r1 = phi (init, r1)
6327         r0 = x0 + r0;
6328         r1 = x1 + r1;
6329     (i.e. we generate VF results in 2 registers).
6330     In this case we have a separate def-use cycle for each copy, and therefore
6331     for each copy we get the vector def for the reduction variable from the
6332     respective phi node created for this copy.
6333
6334     Otherwise (the reduction is unused in the loop nest), we can combine
6335     together intermediate results, like so (e.g. for ncopies=2):
6336         r = phi (init, r)
6337         r = x0 + r;
6338         r = x1 + r;
6339    (i.e. we generate VF/2 results in a single register).
6340    In this case for each copy we get the vector def for the reduction variable
6341    from the vectorized reduction operation generated in the previous iteration.
6342
6343    This only works when we see both the reduction PHI and its only consumer
6344    in vectorizable_reduction and there are no intermediate stmts
6345    participating.  */
6346   use_operand_p use_p;
6347   gimple *use_stmt;
6348   if (ncopies > 1
6349       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6350       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6351       && (use_stmt == stmt
6352           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6353     {
6354       single_defuse_cycle = true;
6355       epilog_copies = 1;
6356     }
6357   else
6358     epilog_copies = ncopies;
6359
6360   /* If the reduction stmt is one of the patterns that have lane
6361      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6362   if ((ncopies > 1
6363        && ! single_defuse_cycle)
6364       && (code == DOT_PROD_EXPR
6365           || code == WIDEN_SUM_EXPR
6366           || code == SAD_EXPR))
6367     {
6368       if (dump_enabled_p ())
6369         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6370                          "multi def-use cycle not possible for lane-reducing "
6371                          "reduction operation\n");
6372       return false;
6373     }
6374
6375   if (!vec_stmt) /* transformation not required.  */
6376     {
6377       if (first_p)
6378         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
6379       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6380       return true;
6381     }
6382
6383   /* Transform.  */
6384
6385   if (dump_enabled_p ())
6386     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6387
6388   /* FORNOW: Multiple types are not supported for condition.  */
6389   if (code == COND_EXPR)
6390     gcc_assert (ncopies == 1);
6391
6392   /* Create the destination vector  */
6393   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6394
6395   prev_stmt_info = NULL;
6396   prev_phi_info = NULL;
6397   if (slp_node)
6398     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6399   else
6400     {
6401       vec_num = 1;
6402       vec_oprnds0.create (1);
6403       vec_oprnds1.create (1);
6404       if (op_type == ternary_op)
6405         vec_oprnds2.create (1);
6406     }
6407
6408   phis.create (vec_num);
6409   vect_defs.create (vec_num);
6410   if (!slp_node)
6411     vect_defs.quick_push (NULL_TREE);
6412
6413   if (slp_node)
6414     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6415   else
6416     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6417
6418   for (j = 0; j < ncopies; j++)
6419     {
6420       if (code == COND_EXPR)
6421         {
6422           gcc_assert (!slp_node);
6423           vectorizable_condition (stmt, gsi, vec_stmt,
6424                                   PHI_RESULT (phis[0]),
6425                                   reduc_index, NULL);
6426           /* Multiple types are not supported for condition.  */
6427           break;
6428         }
6429
6430       /* Handle uses.  */
6431       if (j == 0)
6432         {
6433           if (slp_node)
6434             {
6435               /* Get vec defs for all the operands except the reduction index,
6436                  ensuring the ordering of the ops in the vector is kept.  */
6437               auto_vec<tree, 3> slp_ops;
6438               auto_vec<vec<tree>, 3> vec_defs;
6439
6440               slp_ops.quick_push (ops[0]);
6441               slp_ops.quick_push (ops[1]);
6442               if (op_type == ternary_op)
6443                 slp_ops.quick_push (ops[2]);
6444
6445               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6446
6447               vec_oprnds0.safe_splice (vec_defs[0]);
6448               vec_defs[0].release ();
6449               vec_oprnds1.safe_splice (vec_defs[1]);
6450               vec_defs[1].release ();
6451               if (op_type == ternary_op)
6452                 {
6453                   vec_oprnds2.safe_splice (vec_defs[2]);
6454                   vec_defs[2].release ();
6455                 }
6456             }
6457           else
6458             {
6459               vec_oprnds0.quick_push
6460                 (vect_get_vec_def_for_operand (ops[0], stmt));
6461               vec_oprnds1.quick_push
6462                 (vect_get_vec_def_for_operand (ops[1], stmt));
6463               if (op_type == ternary_op)
6464                 vec_oprnds2.quick_push
6465                   (vect_get_vec_def_for_operand (ops[2], stmt));
6466             }
6467         }
6468       else
6469         {
6470           if (!slp_node)
6471             {
6472               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6473
6474               if (single_defuse_cycle && reduc_index == 0)
6475                 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6476               else
6477                 vec_oprnds0[0]
6478                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6479               if (single_defuse_cycle && reduc_index == 1)
6480                 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6481               else
6482                 vec_oprnds1[0]
6483                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6484               if (op_type == ternary_op)
6485                 {
6486                   if (single_defuse_cycle && reduc_index == 2)
6487                     vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6488                   else
6489                     vec_oprnds2[0]
6490                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6491                 }
6492             }
6493         }
6494
6495       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6496         {
6497           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6498           if (op_type == ternary_op)
6499             vop[2] = vec_oprnds2[i];
6500
6501           new_temp = make_ssa_name (vec_dest, new_stmt);
6502           new_stmt = gimple_build_assign (new_temp, code,
6503                                           vop[0], vop[1], vop[2]);
6504           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6505
6506           if (slp_node)
6507             {
6508               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6509               vect_defs.quick_push (new_temp);
6510             }
6511           else
6512             vect_defs[0] = new_temp;
6513         }
6514
6515       if (slp_node)
6516         continue;
6517
6518       if (j == 0)
6519         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6520       else
6521         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6522
6523       prev_stmt_info = vinfo_for_stmt (new_stmt);
6524     }
6525
6526   /* Finalize the reduction-phi (set its arguments) and create the
6527      epilog reduction code.  */
6528   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6529     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6530
6531   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6532                                     epilog_copies, reduc_fn, phis,
6533                                     double_reduc, slp_node, slp_node_instance,
6534                                     cond_reduc_val, cond_reduc_op_code);
6535
6536   return true;
6537 }
6538
6539 /* Function vect_min_worthwhile_factor.
6540
6541    For a loop where we could vectorize the operation indicated by CODE,
6542    return the minimum vectorization factor that makes it worthwhile
6543    to use generic vectors.  */
6544 int
6545 vect_min_worthwhile_factor (enum tree_code code)
6546 {
6547   switch (code)
6548     {
6549     case PLUS_EXPR:
6550     case MINUS_EXPR:
6551     case NEGATE_EXPR:
6552       return 4;
6553
6554     case BIT_AND_EXPR:
6555     case BIT_IOR_EXPR:
6556     case BIT_XOR_EXPR:
6557     case BIT_NOT_EXPR:
6558       return 2;
6559
6560     default:
6561       return INT_MAX;
6562     }
6563 }
6564
6565 /* Return true if VINFO indicates we are doing loop vectorization and if
6566    it is worth decomposing CODE operations into scalar operations for
6567    that loop's vectorization factor.  */
6568
6569 bool
6570 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6571 {
6572   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6573   return (loop_vinfo
6574           && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6575               >= vect_min_worthwhile_factor (code)));
6576 }
6577
6578 /* Function vectorizable_induction
6579
6580    Check if PHI performs an induction computation that can be vectorized.
6581    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6582    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6583    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6584
6585 bool
6586 vectorizable_induction (gimple *phi,
6587                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6588                         gimple **vec_stmt, slp_tree slp_node)
6589 {
6590   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6591   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6592   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6593   unsigned ncopies;
6594   bool nested_in_vect_loop = false;
6595   struct loop *iv_loop;
6596   tree vec_def;
6597   edge pe = loop_preheader_edge (loop);
6598   basic_block new_bb;
6599   tree new_vec, vec_init, vec_step, t;
6600   tree new_name;
6601   gimple *new_stmt;
6602   gphi *induction_phi;
6603   tree induc_def, vec_dest;
6604   tree init_expr, step_expr;
6605   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6606   unsigned i;
6607   tree expr;
6608   gimple_seq stmts;
6609   imm_use_iterator imm_iter;
6610   use_operand_p use_p;
6611   gimple *exit_phi;
6612   edge latch_e;
6613   tree loop_arg;
6614   gimple_stmt_iterator si;
6615   basic_block bb = gimple_bb (phi);
6616
6617   if (gimple_code (phi) != GIMPLE_PHI)
6618     return false;
6619
6620   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6621     return false;
6622
6623   /* Make sure it was recognized as induction computation.  */
6624   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6625     return false;
6626
6627   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6628   unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6629
6630   if (slp_node)
6631     ncopies = 1;
6632   else
6633     ncopies = vect_get_num_copies (loop_vinfo, vectype);
6634   gcc_assert (ncopies >= 1);
6635
6636   /* FORNOW. These restrictions should be relaxed.  */
6637   if (nested_in_vect_loop_p (loop, phi))
6638     {
6639       imm_use_iterator imm_iter;
6640       use_operand_p use_p;
6641       gimple *exit_phi;
6642       edge latch_e;
6643       tree loop_arg;
6644
6645       if (ncopies > 1)
6646         {
6647           if (dump_enabled_p ())
6648             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6649                              "multiple types in nested loop.\n");
6650           return false;
6651         }
6652
6653       /* FORNOW: outer loop induction with SLP not supported.  */
6654       if (STMT_SLP_TYPE (stmt_info))
6655         return false;
6656
6657       exit_phi = NULL;
6658       latch_e = loop_latch_edge (loop->inner);
6659       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6660       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6661         {
6662           gimple *use_stmt = USE_STMT (use_p);
6663           if (is_gimple_debug (use_stmt))
6664             continue;
6665
6666           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6667             {
6668               exit_phi = use_stmt;
6669               break;
6670             }
6671         }
6672       if (exit_phi)
6673         {
6674           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6675           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6676                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6677             {
6678               if (dump_enabled_p ())
6679                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6680                                  "inner-loop induction only used outside "
6681                                  "of the outer vectorized loop.\n");
6682               return false;
6683             }
6684         }
6685
6686       nested_in_vect_loop = true;
6687       iv_loop = loop->inner;
6688     }
6689   else
6690     iv_loop = loop;
6691   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6692
6693   if (!vec_stmt) /* transformation not required.  */
6694     {
6695       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6696       if (dump_enabled_p ())
6697         dump_printf_loc (MSG_NOTE, vect_location,
6698                          "=== vectorizable_induction ===\n");
6699       vect_model_induction_cost (stmt_info, ncopies);
6700       return true;
6701     }
6702
6703   /* Transform.  */
6704
6705   /* Compute a vector variable, initialized with the first VF values of
6706      the induction variable.  E.g., for an iv with IV_PHI='X' and
6707      evolution S, for a vector of 4 units, we want to compute:
6708      [X, X + S, X + 2*S, X + 3*S].  */
6709
6710   if (dump_enabled_p ())
6711     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6712
6713   latch_e = loop_latch_edge (iv_loop);
6714   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6715
6716   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6717   gcc_assert (step_expr != NULL_TREE);
6718
6719   pe = loop_preheader_edge (iv_loop);
6720   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6721                                      loop_preheader_edge (iv_loop));
6722
6723   /* Convert the step to the desired type.  */
6724   stmts = NULL;
6725   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6726   if (stmts)
6727     {
6728       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6729       gcc_assert (!new_bb);
6730     }
6731
6732   /* Find the first insertion point in the BB.  */
6733   si = gsi_after_labels (bb);
6734
6735   /* For SLP induction we have to generate several IVs as for example
6736      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6737      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6738      [VF*S, VF*S, VF*S, VF*S] for all.  */
6739   if (slp_node)
6740     {
6741       /* Convert the init to the desired type.  */
6742       stmts = NULL;
6743       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6744       if (stmts)
6745         {
6746           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6747           gcc_assert (!new_bb);
6748         }
6749
6750       /* Generate [VF*S, VF*S, ... ].  */
6751       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6752         {
6753           expr = build_int_cst (integer_type_node, vf);
6754           expr = fold_convert (TREE_TYPE (step_expr), expr);
6755         }
6756       else
6757         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6758       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6759                               expr, step_expr);
6760       if (! CONSTANT_CLASS_P (new_name))
6761         new_name = vect_init_vector (phi, new_name,
6762                                      TREE_TYPE (step_expr), NULL);
6763       new_vec = build_vector_from_val (vectype, new_name);
6764       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6765
6766       /* Now generate the IVs.  */
6767       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6768       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6769       unsigned elts = nunits * nvects;
6770       unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6771       gcc_assert (elts % group_size == 0);
6772       tree elt = init_expr;
6773       unsigned ivn;
6774       for (ivn = 0; ivn < nivs; ++ivn)
6775         {
6776           tree_vector_builder elts (vectype, nunits, 1);
6777           stmts = NULL;
6778           for (unsigned eltn = 0; eltn < nunits; ++eltn)
6779             {
6780               if (ivn*nunits + eltn >= group_size
6781                   && (ivn*nunits + eltn) % group_size == 0)
6782                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6783                                     elt, step_expr);
6784               elts.quick_push (elt);
6785             }
6786           vec_init = gimple_build_vector (&stmts, &elts);
6787           if (stmts)
6788             {
6789               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6790               gcc_assert (!new_bb);
6791             }
6792
6793           /* Create the induction-phi that defines the induction-operand.  */
6794           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6795           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6796           set_vinfo_for_stmt (induction_phi,
6797                               new_stmt_vec_info (induction_phi, loop_vinfo));
6798           induc_def = PHI_RESULT (induction_phi);
6799
6800           /* Create the iv update inside the loop  */
6801           vec_def = make_ssa_name (vec_dest);
6802           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6803           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6804           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6805
6806           /* Set the arguments of the phi node:  */
6807           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6808           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6809                        UNKNOWN_LOCATION);
6810
6811           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6812         }
6813
6814       /* Re-use IVs when we can.  */
6815       if (ivn < nvects)
6816         {
6817           unsigned vfp
6818             = least_common_multiple (group_size, nunits) / group_size;
6819           /* Generate [VF'*S, VF'*S, ... ].  */
6820           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6821             {
6822               expr = build_int_cst (integer_type_node, vfp);
6823               expr = fold_convert (TREE_TYPE (step_expr), expr);
6824             }
6825           else
6826             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6827           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6828                                   expr, step_expr);
6829           if (! CONSTANT_CLASS_P (new_name))
6830             new_name = vect_init_vector (phi, new_name,
6831                                          TREE_TYPE (step_expr), NULL);
6832           new_vec = build_vector_from_val (vectype, new_name);
6833           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6834           for (; ivn < nvects; ++ivn)
6835             {
6836               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6837               tree def;
6838               if (gimple_code (iv) == GIMPLE_PHI)
6839                 def = gimple_phi_result (iv);
6840               else
6841                 def = gimple_assign_lhs (iv);
6842               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6843                                               PLUS_EXPR,
6844                                               def, vec_step);
6845               if (gimple_code (iv) == GIMPLE_PHI)
6846                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6847               else
6848                 {
6849                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6850                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6851                 }
6852               set_vinfo_for_stmt (new_stmt,
6853                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6854               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6855             }
6856         }
6857
6858       return true;
6859     }
6860
6861   /* Create the vector that holds the initial_value of the induction.  */
6862   if (nested_in_vect_loop)
6863     {
6864       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6865          been created during vectorization of previous stmts.  We obtain it
6866          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6867       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6868       /* If the initial value is not of proper type, convert it.  */
6869       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6870         {
6871           new_stmt
6872             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6873                                                           vect_simple_var,
6874                                                           "vec_iv_"),
6875                                    VIEW_CONVERT_EXPR,
6876                                    build1 (VIEW_CONVERT_EXPR, vectype,
6877                                            vec_init));
6878           vec_init = gimple_assign_lhs (new_stmt);
6879           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6880                                                  new_stmt);
6881           gcc_assert (!new_bb);
6882           set_vinfo_for_stmt (new_stmt,
6883                               new_stmt_vec_info (new_stmt, loop_vinfo));
6884         }
6885     }
6886   else
6887     {
6888       /* iv_loop is the loop to be vectorized. Create:
6889          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6890       stmts = NULL;
6891       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6892
6893       tree_vector_builder elts (vectype, nunits, 1);
6894       elts.quick_push (new_name);
6895       for (i = 1; i < nunits; i++)
6896         {
6897           /* Create: new_name_i = new_name + step_expr  */
6898           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6899                                    new_name, step_expr);
6900           elts.quick_push (new_name);
6901         }
6902       /* Create a vector from [new_name_0, new_name_1, ...,
6903          new_name_nunits-1]  */
6904       vec_init = gimple_build_vector (&stmts, &elts);
6905       if (stmts)
6906         {
6907           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6908           gcc_assert (!new_bb);
6909         }
6910     }
6911
6912
6913   /* Create the vector that holds the step of the induction.  */
6914   if (nested_in_vect_loop)
6915     /* iv_loop is nested in the loop to be vectorized. Generate:
6916        vec_step = [S, S, S, S]  */
6917     new_name = step_expr;
6918   else
6919     {
6920       /* iv_loop is the loop to be vectorized. Generate:
6921           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
6922       gimple_seq seq = NULL;
6923       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6924         {
6925           expr = build_int_cst (integer_type_node, vf);
6926           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6927         }
6928       else
6929         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6930       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6931                                expr, step_expr);
6932       if (seq)
6933         {
6934           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6935           gcc_assert (!new_bb);
6936         }
6937     }
6938
6939   t = unshare_expr (new_name);
6940   gcc_assert (CONSTANT_CLASS_P (new_name)
6941               || TREE_CODE (new_name) == SSA_NAME);
6942   new_vec = build_vector_from_val (vectype, t);
6943   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6944
6945
6946   /* Create the following def-use cycle:
6947      loop prolog:
6948          vec_init = ...
6949          vec_step = ...
6950      loop:
6951          vec_iv = PHI <vec_init, vec_loop>
6952          ...
6953          STMT
6954          ...
6955          vec_loop = vec_iv + vec_step;  */
6956
6957   /* Create the induction-phi that defines the induction-operand.  */
6958   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6959   induction_phi = create_phi_node (vec_dest, iv_loop->header);
6960   set_vinfo_for_stmt (induction_phi,
6961                       new_stmt_vec_info (induction_phi, loop_vinfo));
6962   induc_def = PHI_RESULT (induction_phi);
6963
6964   /* Create the iv update inside the loop  */
6965   vec_def = make_ssa_name (vec_dest);
6966   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6967   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6968   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6969
6970   /* Set the arguments of the phi node:  */
6971   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6972   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6973                UNKNOWN_LOCATION);
6974
6975   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6976
6977   /* In case that vectorization factor (VF) is bigger than the number
6978      of elements that we can fit in a vectype (nunits), we have to generate
6979      more than one vector stmt - i.e - we need to "unroll" the
6980      vector stmt by a factor VF/nunits.  For more details see documentation
6981      in vectorizable_operation.  */
6982
6983   if (ncopies > 1)
6984     {
6985       gimple_seq seq = NULL;
6986       stmt_vec_info prev_stmt_vinfo;
6987       /* FORNOW. This restriction should be relaxed.  */
6988       gcc_assert (!nested_in_vect_loop);
6989
6990       /* Create the vector that holds the step of the induction.  */
6991       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6992         {
6993           expr = build_int_cst (integer_type_node, nunits);
6994           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6995         }
6996       else
6997         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6998       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6999                                expr, step_expr);
7000       if (seq)
7001         {
7002           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7003           gcc_assert (!new_bb);
7004         }
7005
7006       t = unshare_expr (new_name);
7007       gcc_assert (CONSTANT_CLASS_P (new_name)
7008                   || TREE_CODE (new_name) == SSA_NAME);
7009       new_vec = build_vector_from_val (vectype, t);
7010       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7011
7012       vec_def = induc_def;
7013       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7014       for (i = 1; i < ncopies; i++)
7015         {
7016           /* vec_i = vec_prev + vec_step  */
7017           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7018                                           vec_def, vec_step);
7019           vec_def = make_ssa_name (vec_dest, new_stmt);
7020           gimple_assign_set_lhs (new_stmt, vec_def);
7021
7022           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7023           set_vinfo_for_stmt (new_stmt,
7024                               new_stmt_vec_info (new_stmt, loop_vinfo));
7025           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7026           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7027         }
7028     }
7029
7030   if (nested_in_vect_loop)
7031     {
7032       /* Find the loop-closed exit-phi of the induction, and record
7033          the final vector of induction results:  */
7034       exit_phi = NULL;
7035       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7036         {
7037           gimple *use_stmt = USE_STMT (use_p);
7038           if (is_gimple_debug (use_stmt))
7039             continue;
7040
7041           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7042             {
7043               exit_phi = use_stmt;
7044               break;
7045             }
7046         }
7047       if (exit_phi)
7048         {
7049           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7050           /* FORNOW. Currently not supporting the case that an inner-loop induction
7051              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7052           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7053                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7054
7055           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7056           if (dump_enabled_p ())
7057             {
7058               dump_printf_loc (MSG_NOTE, vect_location,
7059                                "vector of inductions after inner-loop:");
7060               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7061             }
7062         }
7063     }
7064
7065
7066   if (dump_enabled_p ())
7067     {
7068       dump_printf_loc (MSG_NOTE, vect_location,
7069                        "transform induction: created def-use cycle: ");
7070       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7071       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7072                         SSA_NAME_DEF_STMT (vec_def), 0);
7073     }
7074
7075   return true;
7076 }
7077
7078 /* Function vectorizable_live_operation.
7079
7080    STMT computes a value that is used outside the loop.  Check if
7081    it can be supported.  */
7082
7083 bool
7084 vectorizable_live_operation (gimple *stmt,
7085                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7086                              slp_tree slp_node, int slp_index,
7087                              gimple **vec_stmt)
7088 {
7089   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7090   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7091   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7092   imm_use_iterator imm_iter;
7093   tree lhs, lhs_type, bitsize, vec_bitsize;
7094   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7095   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7096   int ncopies;
7097   gimple *use_stmt;
7098   auto_vec<tree> vec_oprnds;
7099
7100   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7101
7102   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7103     return false;
7104
7105   /* FORNOW.  CHECKME.  */
7106   if (nested_in_vect_loop_p (loop, stmt))
7107     return false;
7108
7109   /* If STMT is not relevant and it is a simple assignment and its inputs are
7110      invariant then it can remain in place, unvectorized.  The original last
7111      scalar value that it computes will be used.  */
7112   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7113     {
7114       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7115       if (dump_enabled_p ())
7116         dump_printf_loc (MSG_NOTE, vect_location,
7117                          "statement is simple and uses invariant.  Leaving in "
7118                          "place.\n");
7119       return true;
7120     }
7121
7122   if (slp_node)
7123     ncopies = 1;
7124   else
7125     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7126
7127   if (!vec_stmt)
7128     /* No transformation required.  */
7129     return true;
7130
7131   /* If stmt has a related stmt, then use that for getting the lhs.  */
7132   if (is_pattern_stmt_p (stmt_info))
7133     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7134
7135   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7136         : gimple_get_lhs (stmt);
7137   lhs_type = TREE_TYPE (lhs);
7138
7139   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7140              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7141              : TYPE_SIZE (TREE_TYPE (vectype)));
7142   vec_bitsize = TYPE_SIZE (vectype);
7143
7144   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7145   tree vec_lhs, bitstart;
7146   if (slp_node)
7147     {
7148       gcc_assert (slp_index >= 0);
7149
7150       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7151       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7152
7153       /* Get the last occurrence of the scalar index from the concatenation of
7154          all the slp vectors. Calculate which slp vector it is and the index
7155          within.  */
7156       int pos = (num_vec * nunits) - num_scalar + slp_index;
7157       int vec_entry = pos / nunits;
7158       int vec_index = pos % nunits;
7159
7160       /* Get the correct slp vectorized stmt.  */
7161       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7162
7163       /* Get entry to use.  */
7164       bitstart = bitsize_int (vec_index);
7165       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7166     }
7167   else
7168     {
7169       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7170       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7171
7172       /* For multiple copies, get the last copy.  */
7173       for (int i = 1; i < ncopies; ++i)
7174         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7175                                                   vec_lhs);
7176
7177       /* Get the last lane in the vector.  */
7178       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7179     }
7180
7181   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7182      loop.  */
7183   gimple_seq stmts = NULL;
7184   tree bftype = TREE_TYPE (vectype);
7185   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7186     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7187   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7188   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7189                                    true, NULL_TREE);
7190   if (stmts)
7191     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7192
7193   /* Replace use of lhs with newly computed result.  If the use stmt is a
7194      single arg PHI, just replace all uses of PHI result.  It's necessary
7195      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7196   use_operand_p use_p;
7197   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7198     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7199         && !is_gimple_debug (use_stmt))
7200     {
7201       if (gimple_code (use_stmt) == GIMPLE_PHI
7202           && gimple_phi_num_args (use_stmt) == 1)
7203         {
7204           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7205         }
7206       else
7207         {
7208           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7209             SET_USE (use_p, new_tree);
7210         }
7211       update_stmt (use_stmt);
7212     }
7213
7214   return true;
7215 }
7216
7217 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
7218
7219 static void
7220 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7221 {
7222   ssa_op_iter op_iter;
7223   imm_use_iterator imm_iter;
7224   def_operand_p def_p;
7225   gimple *ustmt;
7226
7227   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7228     {
7229       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7230         {
7231           basic_block bb;
7232
7233           if (!is_gimple_debug (ustmt))
7234             continue;
7235
7236           bb = gimple_bb (ustmt);
7237
7238           if (!flow_bb_inside_loop_p (loop, bb))
7239             {
7240               if (gimple_debug_bind_p (ustmt))
7241                 {
7242                   if (dump_enabled_p ())
7243                     dump_printf_loc (MSG_NOTE, vect_location,
7244                                      "killing debug use\n");
7245
7246                   gimple_debug_bind_reset_value (ustmt);
7247                   update_stmt (ustmt);
7248                 }
7249               else
7250                 gcc_unreachable ();
7251             }
7252         }
7253     }
7254 }
7255
7256 /* Given loop represented by LOOP_VINFO, return true if computation of
7257    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7258    otherwise.  */
7259
7260 static bool
7261 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7262 {
7263   /* Constant case.  */
7264   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7265     {
7266       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7267       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7268
7269       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7270       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7271       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7272         return true;
7273     }
7274
7275   widest_int max;
7276   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7277   /* Check the upper bound of loop niters.  */
7278   if (get_max_loop_iterations (loop, &max))
7279     {
7280       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7281       signop sgn = TYPE_SIGN (type);
7282       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7283       if (max < type_max)
7284         return true;
7285     }
7286   return false;
7287 }
7288
7289 /* Scale profiling counters by estimation for LOOP which is vectorized
7290    by factor VF.  */
7291
7292 static void
7293 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7294 {
7295   edge preheader = loop_preheader_edge (loop);
7296   /* Reduce loop iterations by the vectorization factor.  */
7297   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7298   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7299
7300   if (freq_h.nonzero_p ())
7301     {
7302       profile_probability p;
7303
7304       /* Avoid dropping loop body profile counter to 0 because of zero count
7305          in loop's preheader.  */
7306       if (!(freq_e == profile_count::zero ()))
7307         freq_e = freq_e.force_nonzero ();
7308       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7309       scale_loop_frequencies (loop, p);
7310     }
7311
7312   edge exit_e = single_exit (loop);
7313   exit_e->probability = profile_probability::always ()
7314                                  .apply_scale (1, new_est_niter + 1);
7315
7316   edge exit_l = single_pred_edge (loop->latch);
7317   profile_probability prob = exit_l->probability;
7318   exit_l->probability = exit_e->probability.invert ();
7319   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7320     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7321 }
7322
7323 /* Function vect_transform_loop.
7324
7325    The analysis phase has determined that the loop is vectorizable.
7326    Vectorize the loop - created vectorized stmts to replace the scalar
7327    stmts in the loop, and update the loop exit condition.
7328    Returns scalar epilogue loop if any.  */
7329
7330 struct loop *
7331 vect_transform_loop (loop_vec_info loop_vinfo)
7332 {
7333   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7334   struct loop *epilogue = NULL;
7335   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7336   int nbbs = loop->num_nodes;
7337   int i;
7338   tree niters_vector = NULL;
7339   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7340   bool grouped_store;
7341   bool slp_scheduled = false;
7342   gimple *stmt, *pattern_stmt;
7343   gimple_seq pattern_def_seq = NULL;
7344   gimple_stmt_iterator pattern_def_si = gsi_none ();
7345   bool transform_pattern_stmt = false;
7346   bool check_profitability = false;
7347   int th;
7348
7349   if (dump_enabled_p ())
7350     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7351
7352   /* Use the more conservative vectorization threshold.  If the number
7353      of iterations is constant assume the cost check has been performed
7354      by our caller.  If the threshold makes all loops profitable that
7355      run at least the vectorization factor number of times checking
7356      is pointless, too.  */
7357   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7358   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7359       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7360     {
7361       if (dump_enabled_p ())
7362         dump_printf_loc (MSG_NOTE, vect_location,
7363                          "Profitability threshold is %d loop iterations.\n",
7364                          th);
7365       check_profitability = true;
7366     }
7367
7368   /* Make sure there exists a single-predecessor exit bb.  Do this before
7369      versioning.   */
7370   edge e = single_exit (loop);
7371   if (! single_pred_p (e->dest))
7372     {
7373       split_loop_exit_edge (e);
7374       if (dump_enabled_p ())
7375         dump_printf (MSG_NOTE, "split exit edge\n");
7376     }
7377
7378   /* Version the loop first, if required, so the profitability check
7379      comes first.  */
7380
7381   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7382     {
7383       vect_loop_versioning (loop_vinfo, th, check_profitability);
7384       check_profitability = false;
7385     }
7386
7387   /* Make sure there exists a single-predecessor exit bb also on the
7388      scalar loop copy.  Do this after versioning but before peeling
7389      so CFG structure is fine for both scalar and if-converted loop
7390      to make slpeel_duplicate_current_defs_from_edges face matched
7391      loop closed PHI nodes on the exit.  */
7392   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7393     {
7394       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7395       if (! single_pred_p (e->dest))
7396         {
7397           split_loop_exit_edge (e);
7398           if (dump_enabled_p ())
7399             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7400         }
7401     }
7402
7403   tree niters = vect_build_loop_niters (loop_vinfo);
7404   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7405   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7406   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7407   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7408                               check_profitability, niters_no_overflow);
7409   if (niters_vector == NULL_TREE)
7410     {
7411       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7412         niters_vector
7413           = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7414                            LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7415       else
7416         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7417                                      niters_no_overflow);
7418     }
7419
7420   /* 1) Make sure the loop header has exactly two entries
7421      2) Make sure we have a preheader basic block.  */
7422
7423   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7424
7425   split_edge (loop_preheader_edge (loop));
7426
7427   /* FORNOW: the vectorizer supports only loops which body consist
7428      of one basic block (header + empty latch). When the vectorizer will
7429      support more involved loop forms, the order by which the BBs are
7430      traversed need to be reconsidered.  */
7431
7432   for (i = 0; i < nbbs; i++)
7433     {
7434       basic_block bb = bbs[i];
7435       stmt_vec_info stmt_info;
7436
7437       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7438            gsi_next (&si))
7439         {
7440           gphi *phi = si.phi ();
7441           if (dump_enabled_p ())
7442             {
7443               dump_printf_loc (MSG_NOTE, vect_location,
7444                                "------>vectorizing phi: ");
7445               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7446             }
7447           stmt_info = vinfo_for_stmt (phi);
7448           if (!stmt_info)
7449             continue;
7450
7451           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7452             vect_loop_kill_debug_uses (loop, phi);
7453
7454           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7455               && !STMT_VINFO_LIVE_P (stmt_info))
7456             continue;
7457
7458           if (STMT_VINFO_VECTYPE (stmt_info)
7459               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7460                   != (unsigned HOST_WIDE_INT) vf)
7461               && dump_enabled_p ())
7462             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7463
7464           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7465                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7466                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7467               && ! PURE_SLP_STMT (stmt_info))
7468             {
7469               if (dump_enabled_p ())
7470                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7471               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7472             }
7473         }
7474
7475       pattern_stmt = NULL;
7476       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7477            !gsi_end_p (si) || transform_pattern_stmt;)
7478         {
7479           bool is_store;
7480
7481           if (transform_pattern_stmt)
7482             stmt = pattern_stmt;
7483           else
7484             {
7485               stmt = gsi_stmt (si);
7486               /* During vectorization remove existing clobber stmts.  */
7487               if (gimple_clobber_p (stmt))
7488                 {
7489                   unlink_stmt_vdef (stmt);
7490                   gsi_remove (&si, true);
7491                   release_defs (stmt);
7492                   continue;
7493                 }
7494             }
7495
7496           if (dump_enabled_p ())
7497             {
7498               dump_printf_loc (MSG_NOTE, vect_location,
7499                                "------>vectorizing statement: ");
7500               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7501             }
7502
7503           stmt_info = vinfo_for_stmt (stmt);
7504
7505           /* vector stmts created in the outer-loop during vectorization of
7506              stmts in an inner-loop may not have a stmt_info, and do not
7507              need to be vectorized.  */
7508           if (!stmt_info)
7509             {
7510               gsi_next (&si);
7511               continue;
7512             }
7513
7514           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7515             vect_loop_kill_debug_uses (loop, stmt);
7516
7517           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7518               && !STMT_VINFO_LIVE_P (stmt_info))
7519             {
7520               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7521                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7522                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7523                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7524                 {
7525                   stmt = pattern_stmt;
7526                   stmt_info = vinfo_for_stmt (stmt);
7527                 }
7528               else
7529                 {
7530                   gsi_next (&si);
7531                   continue;
7532                 }
7533             }
7534           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7535                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7536                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7537                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7538             transform_pattern_stmt = true;
7539
7540           /* If pattern statement has def stmts, vectorize them too.  */
7541           if (is_pattern_stmt_p (stmt_info))
7542             {
7543               if (pattern_def_seq == NULL)
7544                 {
7545                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7546                   pattern_def_si = gsi_start (pattern_def_seq);
7547                 }
7548               else if (!gsi_end_p (pattern_def_si))
7549                 gsi_next (&pattern_def_si);
7550               if (pattern_def_seq != NULL)
7551                 {
7552                   gimple *pattern_def_stmt = NULL;
7553                   stmt_vec_info pattern_def_stmt_info = NULL;
7554
7555                   while (!gsi_end_p (pattern_def_si))
7556                     {
7557                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7558                       pattern_def_stmt_info
7559                         = vinfo_for_stmt (pattern_def_stmt);
7560                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7561                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7562                         break;
7563                       gsi_next (&pattern_def_si);
7564                     }
7565
7566                   if (!gsi_end_p (pattern_def_si))
7567                     {
7568                       if (dump_enabled_p ())
7569                         {
7570                           dump_printf_loc (MSG_NOTE, vect_location,
7571                                            "==> vectorizing pattern def "
7572                                            "stmt: ");
7573                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7574                                             pattern_def_stmt, 0);
7575                         }
7576
7577                       stmt = pattern_def_stmt;
7578                       stmt_info = pattern_def_stmt_info;
7579                     }
7580                   else
7581                     {
7582                       pattern_def_si = gsi_none ();
7583                       transform_pattern_stmt = false;
7584                     }
7585                 }
7586               else
7587                 transform_pattern_stmt = false;
7588             }
7589
7590           if (STMT_VINFO_VECTYPE (stmt_info))
7591             {
7592               unsigned int nunits
7593                 = (unsigned int)
7594                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7595               if (!STMT_SLP_TYPE (stmt_info)
7596                   && nunits != (unsigned int) vf
7597                   && dump_enabled_p ())
7598                   /* For SLP VF is set according to unrolling factor, and not
7599                      to vector size, hence for SLP this print is not valid.  */
7600                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7601             }
7602
7603           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7604              reached.  */
7605           if (STMT_SLP_TYPE (stmt_info))
7606             {
7607               if (!slp_scheduled)
7608                 {
7609                   slp_scheduled = true;
7610
7611                   if (dump_enabled_p ())
7612                     dump_printf_loc (MSG_NOTE, vect_location,
7613                                      "=== scheduling SLP instances ===\n");
7614
7615                   vect_schedule_slp (loop_vinfo);
7616                 }
7617
7618               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7619               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7620                 {
7621                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7622                     {
7623                       pattern_def_seq = NULL;
7624                       gsi_next (&si);
7625                     }
7626                   continue;
7627                 }
7628             }
7629
7630           /* -------- vectorize statement ------------ */
7631           if (dump_enabled_p ())
7632             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7633
7634           grouped_store = false;
7635           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7636           if (is_store)
7637             {
7638               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7639                 {
7640                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7641                      interleaving chain was completed - free all the stores in
7642                      the chain.  */
7643                   gsi_next (&si);
7644                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7645                 }
7646               else
7647                 {
7648                   /* Free the attached stmt_vec_info and remove the stmt.  */
7649                   gimple *store = gsi_stmt (si);
7650                   free_stmt_vec_info (store);
7651                   unlink_stmt_vdef (store);
7652                   gsi_remove (&si, true);
7653                   release_defs (store);
7654                 }
7655
7656               /* Stores can only appear at the end of pattern statements.  */
7657               gcc_assert (!transform_pattern_stmt);
7658               pattern_def_seq = NULL;
7659             }
7660           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7661             {
7662               pattern_def_seq = NULL;
7663               gsi_next (&si);
7664             }
7665         }                       /* stmts in BB */
7666     }                           /* BBs in loop */
7667
7668   slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7669
7670   scale_profile_for_vect_loop (loop, vf);
7671
7672   /* The minimum number of iterations performed by the epilogue.  This
7673      is 1 when peeling for gaps because we always need a final scalar
7674      iteration.  */
7675   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7676   /* +1 to convert latch counts to loop iteration counts,
7677      -min_epilogue_iters to remove iterations that cannot be performed
7678        by the vector code.  */
7679   int bias = 1 - min_epilogue_iters;
7680   /* In these calculations the "- 1" converts loop iteration counts
7681      back to latch counts.  */
7682   if (loop->any_upper_bound)
7683     loop->nb_iterations_upper_bound
7684       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7685   if (loop->any_likely_upper_bound)
7686     loop->nb_iterations_likely_upper_bound
7687       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7688   if (loop->any_estimate)
7689     loop->nb_iterations_estimate
7690       = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
7691
7692   if (dump_enabled_p ())
7693     {
7694       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7695         {
7696           dump_printf_loc (MSG_NOTE, vect_location,
7697                            "LOOP VECTORIZED\n");
7698           if (loop->inner)
7699             dump_printf_loc (MSG_NOTE, vect_location,
7700                              "OUTER LOOP VECTORIZED\n");
7701           dump_printf (MSG_NOTE, "\n");
7702         }
7703       else
7704         dump_printf_loc (MSG_NOTE, vect_location,
7705                          "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7706                          current_vector_size);
7707     }
7708
7709   /* Free SLP instances here because otherwise stmt reference counting
7710      won't work.  */
7711   slp_instance instance;
7712   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7713     vect_free_slp_instance (instance);
7714   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7715   /* Clear-up safelen field since its value is invalid after vectorization
7716      since vectorized loop can have loop-carried dependencies.  */
7717   loop->safelen = 0;
7718
7719   /* Don't vectorize epilogue for epilogue.  */
7720   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7721     epilogue = NULL;
7722
7723   if (epilogue)
7724     {
7725         unsigned int vector_sizes
7726           = targetm.vectorize.autovectorize_vector_sizes ();
7727         vector_sizes &= current_vector_size - 1;
7728
7729         if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7730           epilogue = NULL;
7731         else if (!vector_sizes)
7732           epilogue = NULL;
7733         else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7734                  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7735           {
7736             int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7737             int ratio = current_vector_size / smallest_vec_size;
7738             int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7739               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7740             eiters = eiters % vf;
7741
7742             epilogue->nb_iterations_upper_bound = eiters - 1;
7743
7744             if (eiters < vf / ratio)
7745               epilogue = NULL;
7746             }
7747     }
7748
7749   if (epilogue)
7750     {
7751       epilogue->force_vectorize = loop->force_vectorize;
7752       epilogue->safelen = loop->safelen;
7753       epilogue->dont_vectorize = false;
7754
7755       /* We may need to if-convert epilogue to vectorize it.  */
7756       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7757         tree_if_conversion (epilogue);
7758     }
7759
7760   return epilogue;
7761 }
7762
7763 /* Undo if-conversion around the masked stores of LOOP: every masked
7764    store (a call to IFN_MASK_STORE) is sunk into a new block that is
7765    bypassed when its mask compares equal to the all-zero vector, together
        with the statements producing the stored value where those can be
        moved as well.
7766    For example,
7767      for (i=0; i<n; i++)
7768        if (c[i])
7769         {
7770           p1[i] += 1;
7771           p2[i] = p3[i] + 2;
7772         }
7773    this transformation will produce the following semi-hammock:
7774
7775    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7776      {
7777        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7778        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7779        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7780        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7781        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7782        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7783      }

        LOOP is the loop whose basic blocks are scanned for masked stores; a
        store may belong to LOOP itself or to a loop nested in it.  The
        function modifies the CFG and the virtual SSA web in place and sets
        vect_location to LOOP's location for the dump messages it emits.
7784 */
7785
7786 void
7787 optimize_mask_stores (struct loop *loop)
7788 {
7789   basic_block *bbs = get_loop_body (loop);
7790   unsigned nbbs = loop->num_nodes;
7791   unsigned i;
7792   basic_block bb;
7793   struct loop *bb_loop;
7794   gimple_stmt_iterator gsi;
7795   gimple *stmt;
7796   auto_vec<gimple *> worklist;
7797
7798   vect_location = find_loop_location (loop);
7799   /* Collect all masked stores of LOOP in WORKLIST, in visiting order.  */
7800   for (i = 0; i < nbbs; i++)
7801     {
7802       bb = bbs[i];
7803       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7804            gsi_next (&gsi))
7805         {
7806           stmt = gsi_stmt (gsi);
7807           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7808             worklist.safe_push (stmt);
7809         }
7810     }
7811
       /* The block array is not used past this point.  */
7812   free (bbs);
7813   if (worklist.is_empty ())
7814     return;
7815
7816   /* Loop has masked stores; each iteration below creates one STORE_BB.  */
7817   while (!worklist.is_empty ())
7818     {
7819       gimple *last, *last_store;
7820       edge e, efalse;
7821       tree mask;
7822       basic_block store_bb, join_bb;
7823       gimple_stmt_iterator gsi_to;
7824       tree vdef, new_vdef;
7825       gphi *phi;
7826       tree vectype;
7827       tree zero;
7828
           /* The mask is call argument 2 of the masked store.  */
7829       last = worklist.pop ();
7830       mask = gimple_call_arg (last, 2);
7831       bb = gimple_bb (last);
7832       /* Build an if-then structure in the CFG: split BB at LAST (the
7833          destination of the resulting edge is JOIN_BB) and create STORE_BB
7834          in the same loop as BB.  That loop can differ from LOOP when a
7835          two-level loop nest is vectorized and the masked store belongs to
              the inner loop.  */
7836       e = split_block (bb, last);
7837       bb_loop = bb->loop_father;
7838       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7839       join_bb = e->dest;
7840       store_bb = create_empty_bb (bb);
7841       add_bb_to_loop (store_bb, bb_loop);
           /* E, leading to JOIN_BB, becomes the true edge of the condition
              built below; EFALSE, leading to STORE_BB, is its false edge.  */
7842       e->flags = EDGE_TRUE_VALUE;
7843       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7844       /* The edge into STORE_BB is given probability `unlikely' and
              STORE_BB takes its count from that edge.
              NOTE(review): this comment used to say "Put STORE_BB to likely
              part", which contradicts the code -- confirm which direction is
              intended.  E's probability is not updated here.  */
7845       efalse->probability = profile_probability::unlikely ();
7846       store_bb->count = efalse->count ();
7847       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7848       if (dom_info_available_p (CDI_DOMINATORS))
7849         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
           /* NOTE(review): unlike the other dump messages in this function,
              the one below has no trailing newline.  */
7850       if (dump_enabled_p ())
7851         dump_printf_loc (MSG_NOTE, vect_location,
7852                          "Create new block %d to sink mask stores.",
7853                          store_bb->index);
7854       /* End BB with the condition MASK == 0 (zero of MASK's own type).  */
7855       vectype = TREE_TYPE (mask);
7856       zero = build_zero_cst (vectype);
7857       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7858       gsi = gsi_last_bb (bb);
7859       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7860       /* Create new PHI node for vdef of the last masked store:
7861          .MEM_2 = VDEF <.MEM_1>
7862          will be converted to
7863          .MEM_3 = VDEF <.MEM_1>
7864          and new PHI node will be created in join bb
7865          .MEM_2 = PHI <.MEM_1, .MEM_3>
              The argument for the edge leaving STORE_BB is added here; the
              argument for E is added once all stores have been moved.
7866       */
7867       vdef = gimple_vdef (last);
7868       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7869       gimple_set_vdef (last, new_vdef);
7870       phi = create_phi_node (vdef, join_bb);
7871       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7872
7873       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7874       while (true)
7875         {
7876           gimple_stmt_iterator gsi_from;
7877           gimple *stmt1 = NULL;
7878
7879           /* Move masked store to the start of STORE_BB and remember it as
                  LAST_STORE.  */
7880           last_store = last;
7881           gsi = gsi_for_stmt (last);
7882           gsi_from = gsi;
7883           /* Shift GSI to the previous stmt for further traversal.  */
7884           gsi_prev (&gsi);
7885           gsi_to = gsi_start_bb (store_bb);
7886           gsi_move_before (&gsi_from, &gsi_to);
7887           /* Setup GSI_TO to the non-empty block start.  */
7888           gsi_to = gsi_start_bb (store_bb);
7889           if (dump_enabled_p ())
7890             {
7891               dump_printf_loc (MSG_NOTE, vect_location,
7892                                "Move stmt to created bb\n");
7893               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7894             }
7895           /* Move all stored value producers if possible: walk backwards
                  from the statement that preceded the store and stop at the
                  first statement that cannot be moved.  */
7896           while (!gsi_end_p (gsi))
7897             {
7898               tree lhs;
7899               imm_use_iterator imm_iter;
7900               use_operand_p use_p;
7901               bool res;
7902
7903               /* Skip debug statements.  */
7904               if (is_gimple_debug (gsi_stmt (gsi)))
7905                 {
7906                   gsi_prev (&gsi);
7907                   continue;
7908                 }
7909               stmt1 = gsi_stmt (gsi);
7910               /* Do not consider statements writing to memory or having
7911                  volatile operand.  */
7912               if (gimple_vdef (stmt1)
7913                   || gimple_has_volatile_ops (stmt1))
7914                 break;
                   /* GSI_FROM keeps pointing at STMT1 while GSI already steps
                      to the statement before it.  */
7915               gsi_from = gsi;
7916               gsi_prev (&gsi);
7917               lhs = gimple_get_lhs (stmt1);
7918               if (!lhs)
7919                 break;
7920
7921               /* LHS of vectorized stmt must be SSA_NAME.  */
7922               if (TREE_CODE (lhs) != SSA_NAME)
7923                 break;
7924
7925               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7926                 {
7927                   /* Remove dead scalar statement (non-vector LHS without
                          uses) and keep scanning.  */
7928                   if (has_zero_uses (lhs))
7929                     {
7930                       gsi_remove (&gsi_from, true);
7931                       continue;
7932                     }
7933                 }
7934
7935               /* Check that LHS does not have uses outside of STORE_BB;
                      uses in debug statements are ignored.  */
7936               res = true;
7937               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7938                 {
7939                   gimple *use_stmt;
7940                   use_stmt = USE_STMT (use_p);
7941                   if (is_gimple_debug (use_stmt))
7942                     continue;
7943                   if (gimple_bb (use_stmt) != store_bb)
7944                     {
7945                       res = false;
7946                       break;
7947                     }
7948                 }
7949               if (!res)
7950                 break;
7951
                   /* A statement with a virtual use is only moved when that
                      use is the same virtual operand LAST_STORE uses.  */
7952               if (gimple_vuse (stmt1)
7953                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
7954                 break;
7955
7956               /* Can move STMT1 to STORE_BB.  */
7957               if (dump_enabled_p ())
7958                 {
7959                   dump_printf_loc (MSG_NOTE, vect_location,
7960                                    "Move stmt to created bb\n");
7961                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7962                 }
7963               gsi_move_before (&gsi_from, &gsi_to);
7964               /* Shift GSI_TO for further insertion.  */
7965               gsi_prev (&gsi_to);
7966             }
7967           /* Put other masked stores with the same mask to STORE_BB: go on
                  only if the next store on WORKLIST uses MASK and is exactly
                  the statement STMT1 at which the scan above stopped;
                  otherwise this STORE_BB is complete.  */
7968           if (worklist.is_empty ()
7969               || gimple_call_arg (worklist.last (), 2) != mask
7970               || worklist.last () != stmt1)
7971             break;
7972           last = worklist.pop ();
7973         }
           /* On E, the edge bypassing STORE_BB, the PHI takes the virtual
              operand used by LAST_STORE, the store moved most recently.  */
7974       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
7975     }
7976 }