gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56
  57 /* Loop Vectorization Pass.
  58
  59    This pass tries to vectorize loops.
  60
  61    For example, the vectorizer transforms the following simple loop:
  62
  63         short a[N]; short b[N]; short c[N]; int i;
  64
  65         for (i=0; i<N; i++){
  66           a[i] = b[i] + c[i];
  67         }
  68
  69    as if it was manually vectorized by rewriting the source code into:
  70
  71         typedef int __attribute__((mode(V8HI))) v8hi;
  72         short a[N];  short b[N]; short c[N];   int i;
  73         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  74         v8hi va, vb, vc;
  75
  76         for (i=0; i<N/8; i++){
  77           vb = pb[i];
  78           vc = pc[i];
  79           va = vb + vc;
  80           pa[i] = va;
  81         }
  82
  83         The main entry to this pass is vectorize_loops(), in which
  84    the vectorizer applies a set of analyses on a given set of loops,
  85    followed by the actual vectorization transformation for the loops that
  86    had successfully passed the analysis phase.
  87         Throughout this pass we make a distinction between two types of
  88    data: scalars (which are represented by SSA_NAMES), and memory references
  89    ("data-refs").  These two types of data require different handling both
  90    during analysis and transformation. The types of data-refs that the
  91    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  92    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  93    accesses are required to have a simple (consecutive) access pattern.
  94
  95    Analysis phase:
  96    ===============
  97         The driver for the analysis phase is vect_analyze_loop().
  98    It applies a set of analyses, some of which rely on the scalar evolution
  99    analyzer (scev) developed by Sebastian Pop.
 100
 101         During the analysis phase the vectorizer records some information
 102    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 103    loop, as well as general information about the loop as a whole, which is
 104    recorded in a "loop_vec_info" struct attached to each loop.
 105
 106    Transformation phase:
 107    =====================
 108         The loop transformation phase scans all the stmts in the loop, and
 109    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 110    the loop that needs to be vectorized.  It inserts the vector code sequence
 111    just before the scalar stmt S, and records a pointer to the vector code
 112    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 113    attached to S).  This pointer will be used for the vectorization of following
 114    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 115    otherwise, we rely on dead code elimination for removing it.
 116
 117         For example, say stmt S1 was vectorized into stmt VS1:
 118
 119    VS1: vb = px[i];
 120    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 121    S2:  a = b;
 122
 123    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 124    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 125    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 126    resulting sequence would be:
 127
 128    VS1: vb = px[i];
 129    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 130    VS2: va = vb;
 131    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 132
 133         Operands that are not SSA_NAMEs, are data-refs that appear in
 134    load/store operations (like 'x[i]' in S1), and are handled differently.
 135
 136    Target modeling:
 137    =================
 138         Currently the only target specific information that is used is the
 139    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 140    Targets that can support different sizes of vectors, for now will need
 141    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 142    flexibility will be added in the future.
 143
 144         Since we only vectorize operations which vector form can be
 145    expressed using existing tree codes, to verify that an operation is
 146    supported, the vectorizer checks the relevant optab at the relevant
 147    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 148    the value found is CODE_FOR_nothing, then there's no target support, and
 149    we can't vectorize the stmt.
 150
 151    For additional information on this project see:
 152    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 153 */
 154
 155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 156
 157 /* Function vect_determine_vectorization_factor
 158
 159    Determine the vectorization factor (VF).  VF is the number of data elements
 160    that are operated upon in parallel in a single iteration of the vectorized
 161    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 162    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 163    elements can fit in a single vector register.
 164
 165    We currently support vectorization of loops in which all types operated upon
 166    are of the same size.  Therefore this function currently sets VF according to
 167    the size of the types operated upon, and fails if there are multiple sizes
 168    in the loop.
 169
 170    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 171    original loop:
 172         for (i=0; i<N; i++){
 173           a[i] = b[i] + c[i];
 174         }
 175
 176    vectorized loop:
 177         for (i=0; i<N; i+=VF){
 178           a[i:VF] = b[i:VF] + c[i:VF];
 179         }
 180 */
 181
 182 static bool
 183 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 184 {
 185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 187   unsigned nbbs = loop->num_nodes;
 188   poly_uint64 vectorization_factor = 1;
 189   tree scalar_type = NULL_TREE;
 190   gphi *phi;
 191   tree vectype;
 192   stmt_vec_info stmt_info;
 193   unsigned i;
 194   HOST_WIDE_INT dummy;
 195   gimple *stmt, *pattern_stmt = NULL;
 196   gimple_seq pattern_def_seq = NULL;
 197   gimple_stmt_iterator pattern_def_si = gsi_none ();
 198   bool analyze_pattern_stmt = false;
 199   bool bool_result;
 200   auto_vec<stmt_vec_info> mask_producers;
 201
 202   if (dump_enabled_p ())
 203     dump_printf_loc (MSG_NOTE, vect_location,
 204                      "=== vect_determine_vectorization_factor ===\n");
 205
 206   for (i = 0; i < nbbs; i++)
 207     {
 208       basic_block bb = bbs[i];
 209
 210       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 211            gsi_next (&si))
 212         {
 213           phi = si.phi ();
 214           stmt_info = vinfo_for_stmt (phi);
 215           if (dump_enabled_p ())
 216             {
 217               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 218               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 219             }
 220
 221           gcc_assert (stmt_info);
 222
 223           if (STMT_VINFO_RELEVANT_P (stmt_info)
 224               || STMT_VINFO_LIVE_P (stmt_info))
 225             {
 226               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 227               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 228
 229               if (dump_enabled_p ())
 230                 {
 231                   dump_printf_loc (MSG_NOTE, vect_location,
 232                                    "get vectype for scalar type:  ");
 233                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 234                   dump_printf (MSG_NOTE, "\n");
 235                 }
 236
 237               vectype = get_vectype_for_scalar_type (scalar_type);
 238               if (!vectype)
 239                 {
 240                   if (dump_enabled_p ())
 241                     {
 242                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 243                                        "not vectorized: unsupported "
 244                                        "data-type ");
 245                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 246                                          scalar_type);
 247                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 248                     }
 249                   return false;
 250                 }
 251               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 252
 253               if (dump_enabled_p ())
 254                 {
 255                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 256                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 257                   dump_printf (MSG_NOTE, "\n");
 258                 }
 259
 260               if (dump_enabled_p ())
 261                 dump_printf_loc (MSG_NOTE, vect_location,
 262                                  "nunits = " HOST_WIDE_INT_PRINT_DEC "\n",
 263                                  TYPE_VECTOR_SUBPARTS (vectype));
 264
 265               vect_update_max_nunits (&vectorization_factor, vectype);
 266             }
 267         }
 268
 269       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 270            !gsi_end_p (si) || analyze_pattern_stmt;)
 271         {
 272           tree vf_vectype;
 273
 274           if (analyze_pattern_stmt)
 275             stmt = pattern_stmt;
 276           else
 277             stmt = gsi_stmt (si);
 278
 279           stmt_info = vinfo_for_stmt (stmt);
 280
 281           if (dump_enabled_p ())
 282             {
 283               dump_printf_loc (MSG_NOTE, vect_location,
 284                                "==> examining statement: ");
 285               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 286             }
 287
 288           gcc_assert (stmt_info);
 289
 290           /* Skip stmts which do not need to be vectorized.  */
 291           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 292                && !STMT_VINFO_LIVE_P (stmt_info))
 293               || gimple_clobber_p (stmt))
 294             {
 295               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 296                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 297                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 298                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 299                 {
 300                   stmt = pattern_stmt;
 301                   stmt_info = vinfo_for_stmt (pattern_stmt);
 302                   if (dump_enabled_p ())
 303                     {
 304                       dump_printf_loc (MSG_NOTE, vect_location,
 305                                        "==> examining pattern statement: ");
 306                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 307                     }
 308                 }
 309               else
 310                 {
 311                   if (dump_enabled_p ())
 312                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 313                   gsi_next (&si);
 314                   continue;
 315                 }
 316             }
 317           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 318                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 319                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 320                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 321             analyze_pattern_stmt = true;
 322
 323           /* If a pattern statement has def stmts, analyze them too.  */
 324           if (is_pattern_stmt_p (stmt_info))
 325             {
 326               if (pattern_def_seq == NULL)
 327                 {
 328                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 329                   pattern_def_si = gsi_start (pattern_def_seq);
 330                 }
 331               else if (!gsi_end_p (pattern_def_si))
 332                 gsi_next (&pattern_def_si);
 333               if (pattern_def_seq != NULL)
 334                 {
 335                   gimple *pattern_def_stmt = NULL;
 336                   stmt_vec_info pattern_def_stmt_info = NULL;
 337
 338                   while (!gsi_end_p (pattern_def_si))
 339                     {
 340                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 341                       pattern_def_stmt_info
 342                         = vinfo_for_stmt (pattern_def_stmt);
 343                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 344                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 345                         break;
 346                       gsi_next (&pattern_def_si);
 347                     }
 348
 349                   if (!gsi_end_p (pattern_def_si))
 350                     {
 351                       if (dump_enabled_p ())
 352                         {
 353                           dump_printf_loc (MSG_NOTE, vect_location,
 354                                            "==> examining pattern def stmt: ");
 355                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 356                                             pattern_def_stmt, 0);
 357                         }
 358
 359                       stmt = pattern_def_stmt;
 360                       stmt_info = pattern_def_stmt_info;
 361                     }
 362                   else
 363                     {
 364                       pattern_def_si = gsi_none ();
 365                       analyze_pattern_stmt = false;
 366                     }
 367                 }
 368               else
 369                 analyze_pattern_stmt = false;
 370             }
 371
 372           if (gimple_get_lhs (stmt) == NULL_TREE
 373               /* MASK_STORE has no lhs, but is ok.  */
 374               && (!is_gimple_call (stmt)
 375                   || !gimple_call_internal_p (stmt)
 376                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 377             {
 378               if (is_gimple_call (stmt))
 379                 {
 380                   /* Ignore calls with no lhs.  These must be calls to
 381                      #pragma omp simd functions, and what vectorization factor
 382                      it really needs can't be determined until
 383                      vectorizable_simd_clone_call.  */
 384                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 385                     {
 386                       pattern_def_seq = NULL;
 387                       gsi_next (&si);
 388                     }
 389                   continue;
 390                 }
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                    "not vectorized: irregular stmt.");
 395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 396                                     0);
 397                 }
 398               return false;
 399             }
 400
 401           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 402             {
 403               if (dump_enabled_p ())
 404                 {
 405                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 406                                    "not vectorized: vector stmt in loop:");
 407                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 408                 }
 409               return false;
 410             }
 411
 412           bool_result = false;
 413
 414           if (STMT_VINFO_VECTYPE (stmt_info))
 415             {
 416               /* The only case when a vectype had been already set is for stmts
 417                  that contain a dataref, or for "pattern-stmts" (stmts
 418                  generated by the vectorizer to represent/replace a certain
 419                  idiom).  */
 420               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 421                           || is_pattern_stmt_p (stmt_info)
 422                           || !gsi_end_p (pattern_def_si));
 423               vectype = STMT_VINFO_VECTYPE (stmt_info);
 424             }
 425           else
 426             {
 427               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 428               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 429                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 430               else
 431                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 432
 433               /* Bool ops don't participate in vectorization factor
 434                  computation.  For comparison use compared types to
 435                  compute a factor.  */
 436               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 437                   && is_gimple_assign (stmt)
 438                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 439                 {
 440                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 441                       || STMT_VINFO_LIVE_P (stmt_info))
 442                     mask_producers.safe_push (stmt_info);
 443                   bool_result = true;
 444
 445                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 446                       == tcc_comparison
 447                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 448                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 449                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 450                   else
 451                     {
 452                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 453                         {
 454                           pattern_def_seq = NULL;
 455                           gsi_next (&si);
 456                         }
 457                       continue;
 458                     }
 459                 }
 460
 461               if (dump_enabled_p ())
 462                 {
 463                   dump_printf_loc (MSG_NOTE, vect_location,
 464                                    "get vectype for scalar type:  ");
 465                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 466                   dump_printf (MSG_NOTE, "\n");
 467                 }
 468               vectype = get_vectype_for_scalar_type (scalar_type);
 469               if (!vectype)
 470                 {
 471                   if (dump_enabled_p ())
 472                     {
 473                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 474                                        "not vectorized: unsupported "
 475                                        "data-type ");
 476                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 477                                          scalar_type);
 478                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 479                     }
 480                   return false;
 481                 }
 482
 483               if (!bool_result)
 484                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 485
 486               if (dump_enabled_p ())
 487                 {
 488                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 489                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 490                   dump_printf (MSG_NOTE, "\n");
 491                 }
 492             }
 493
 494           /* Don't try to compute VF out scalar types if we stmt
 495              produces boolean vector.  Use result vectype instead.  */
 496           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 497             vf_vectype = vectype;
 498           else
 499             {
 500               /* The vectorization factor is according to the smallest
 501                  scalar type (or the largest vector size, but we only
 502                  support one vector size per loop).  */
 503               if (!bool_result)
 504                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 505                                                              &dummy);
 506               if (dump_enabled_p ())
 507                 {
 508                   dump_printf_loc (MSG_NOTE, vect_location,
 509                                    "get vectype for scalar type:  ");
 510                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 511                   dump_printf (MSG_NOTE, "\n");
 512                 }
 513               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 514             }
 515           if (!vf_vectype)
 516             {
 517               if (dump_enabled_p ())
 518                 {
 519                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 520                                    "not vectorized: unsupported data-type ");
 521                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 522                                      scalar_type);
 523                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 524                 }
 525               return false;
 526             }
 527
 528           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 529                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 530             {
 531               if (dump_enabled_p ())
 532                 {
 533                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 534                                    "not vectorized: different sized vector "
 535                                    "types in statement, ");
 536                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 537                                      vectype);
 538                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 539                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 540                                      vf_vectype);
 541                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 542                 }
 543               return false;
 544             }
 545
 546           if (dump_enabled_p ())
 547             {
 548               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 549               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 550               dump_printf (MSG_NOTE, "\n");
 551             }
 552
 553           if (dump_enabled_p ())
 554             dump_printf_loc (MSG_NOTE, vect_location,
 555                              "nunits = " HOST_WIDE_INT_PRINT_DEC "\n",
 556                              TYPE_VECTOR_SUBPARTS (vf_vectype));
 557
 558           vect_update_max_nunits (&vectorization_factor, vf_vectype);
 559
 560           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 561             {
 562               pattern_def_seq = NULL;
 563               gsi_next (&si);
 564             }
 565         }
 566     }
 567
 568   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 569   if (dump_enabled_p ())
 570     {
 571       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 572       dump_dec (MSG_NOTE, vectorization_factor);
 573       dump_printf (MSG_NOTE, "\n");
 574     }
 575
 576   if (known_le (vectorization_factor, 1U))
 577     {
 578       if (dump_enabled_p ())
 579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 580                          "not vectorized: unsupported data-type\n");
 581       return false;
 582     }
 583   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 584
 585   for (i = 0; i < mask_producers.length (); i++)
 586     {
 587       tree mask_type = NULL;
 588
 589       stmt = STMT_VINFO_STMT (mask_producers[i]);
 590
 591       if (is_gimple_assign (stmt)
 592           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 593           && !VECT_SCALAR_BOOLEAN_TYPE_P
 594                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 595         {
 596           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 597           mask_type = get_mask_type_for_scalar_type (scalar_type);
 598
 599           if (!mask_type)
 600             {
 601               if (dump_enabled_p ())
 602                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 603                                  "not vectorized: unsupported mask\n");
 604               return false;
 605             }
 606         }
 607       else
 608         {
 609           tree rhs;
 610           ssa_op_iter iter;
 611           gimple *def_stmt;
 612           enum vect_def_type dt;
 613
 614           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 615             {
 616               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 617                                        &def_stmt, &dt, &vectype))
 618                 {
 619                   if (dump_enabled_p ())
 620                     {
 621                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 622                                        "not vectorized: can't compute mask type "
 623                                        "for statement, ");
 624                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 625                                         0);
 626                     }
 627                   return false;
 628                 }
 629
 630               /* No vectype probably means external definition.
 631                  Allow it in case there is another operand which
 632                  allows to determine mask type.  */
 633               if (!vectype)
 634                 continue;
 635
 636               if (!mask_type)
 637                 mask_type = vectype;
 638               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 639                        != TYPE_VECTOR_SUBPARTS (vectype))
 640                 {
 641                   if (dump_enabled_p ())
 642                     {
 643                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 644                                        "not vectorized: different sized masks "
 645                                        "types in statement, ");
 646                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 647                                          mask_type);
 648                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 649                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 650                                          vectype);
 651                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 652                     }
 653                   return false;
 654                 }
 655               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 656                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 657                 {
 658                   if (dump_enabled_p ())
 659                     {
 660                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 661                                        "not vectorized: mixed mask and "
 662                                        "nonmask vector types in statement, ");
 663                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 664                                          mask_type);
 665                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 666                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 667                                          vectype);
 668                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 669                     }
 670                   return false;
 671                 }
 672             }
 673
 674           /* We may compare boolean value loaded as vector of integers.
 675              Fix mask_type in such case.  */
 676           if (mask_type
 677               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 678               && gimple_code (stmt) == GIMPLE_ASSIGN
 679               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 680             mask_type = build_same_sized_truth_vector_type (mask_type);
 681         }
 682
 683       /* No mask_type should mean loop invariant predicate.
 684          This is probably a subject for optimization in
 685          if-conversion.  */
 686       if (!mask_type)
 687         {
 688           if (dump_enabled_p ())
 689             {
 690               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 691                                "not vectorized: can't compute mask type "
 692                                "for statement, ");
 693               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 694                                 0);
 695             }
 696           return false;
 697         }
 698
 699       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 700     }
 701
 702   return true;
 703 }
 704
 705
 706 /* Function vect_is_simple_iv_evolution.
 707
 708    FORNOW: A simple evolution of an induction variables in the loop is
 709    considered a polynomial evolution.  */
 710
 711 static bool
 712 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 713                              tree * step)
 714 {
 715   tree init_expr;
 716   tree step_expr;
 717   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 718   basic_block bb;
 719
 720   /* When there is no evolution in this loop, the evolution function
 721      is not "simple".  */
 722   if (evolution_part == NULL_TREE)
 723     return false;
 724
 725   /* When the evolution is a polynomial of degree >= 2
 726      the evolution function is not "simple".  */
 727   if (tree_is_chrec (evolution_part))
 728     return false;
 729
 730   step_expr = evolution_part;
 731   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 732
 733   if (dump_enabled_p ())
 734     {
 735       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 736       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 737       dump_printf (MSG_NOTE, ",  init: ");
 738       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 739       dump_printf (MSG_NOTE, "\n");
 740     }
 741
 742   *init = init_expr;
 743   *step = step_expr;
 744
 745   if (TREE_CODE (step_expr) != INTEGER_CST
 746       && (TREE_CODE (step_expr) != SSA_NAME
 747           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 748               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 749           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 750               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 751                   || !flag_associative_math)))
 752       && (TREE_CODE (step_expr) != REAL_CST
 753           || !flag_associative_math))
 754     {
 755       if (dump_enabled_p ())
 756         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 757                          "step unknown.\n");
 758       return false;
 759     }
 760
 761   return true;
 762 }
 763
 764 /* Function vect_analyze_scalar_cycles_1.
 765
 766    Examine the cross iteration def-use cycles of scalar variables
 767    in LOOP.  LOOP_VINFO represents the loop that is now being
 768    considered for vectorization (can be LOOP, or an outer-loop
 769    enclosing LOOP).  */
 770
 771 static void
 772 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 773 {
 774   basic_block bb = loop->header;
 775   tree init, step;
 776   auto_vec<gimple *, 64> worklist;
 777   gphi_iterator gsi;
 778   bool double_reduc;
 779
 780   if (dump_enabled_p ())
 781     dump_printf_loc (MSG_NOTE, vect_location,
 782                      "=== vect_analyze_scalar_cycles ===\n");
 783
 784   /* First - identify all inductions.  Reduction detection assumes that all the
 785      inductions have been identified, therefore, this order must not be
 786      changed.  */
 787   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 788     {
 789       gphi *phi = gsi.phi ();
 790       tree access_fn = NULL;
 791       tree def = PHI_RESULT (phi);
 792       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 793
 794       if (dump_enabled_p ())
 795         {
 796           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 797           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 798         }
 799
 800       /* Skip virtual phi's.  The data dependences that are associated with
 801          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 802       if (virtual_operand_p (def))
 803         continue;
 804
 805       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 806
 807       /* Analyze the evolution function.  */
 808       access_fn = analyze_scalar_evolution (loop, def);
 809       if (access_fn)
 810         {
 811           STRIP_NOPS (access_fn);
 812           if (dump_enabled_p ())
 813             {
 814               dump_printf_loc (MSG_NOTE, vect_location,
 815                                "Access function of PHI: ");
 816               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 817               dump_printf (MSG_NOTE, "\n");
 818             }
 819           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 820             = initial_condition_in_loop_num (access_fn, loop->num);
 821           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 822             = evolution_part_in_loop_num (access_fn, loop->num);
 823         }
 824
 825       if (!access_fn
 826           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 827           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 828               && TREE_CODE (step) != INTEGER_CST))
 829         {
 830           worklist.safe_push (phi);
 831           continue;
 832         }
 833
 834       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 835                   != NULL_TREE);
 836       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 837
 838       if (dump_enabled_p ())
 839         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 840       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 841     }
 842
 843
 844   /* Second - identify all reductions and nested cycles.  */
 845   while (worklist.length () > 0)
 846     {
 847       gimple *phi = worklist.pop ();
 848       tree def = PHI_RESULT (phi);
 849       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 850       gimple *reduc_stmt;
 851
 852       if (dump_enabled_p ())
 853         {
 854           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 855           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 856         }
 857
 858       gcc_assert (!virtual_operand_p (def)
 859                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 860
 861       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 862                                                 &double_reduc, false);
 863       if (reduc_stmt)
 864         {
 865           if (double_reduc)
 866             {
 867               if (dump_enabled_p ())
 868                 dump_printf_loc (MSG_NOTE, vect_location,
 869                                  "Detected double reduction.\n");
 870
 871               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 872               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 873                                                     vect_double_reduction_def;
 874             }
 875           else
 876             {
 877               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 878                 {
 879                   if (dump_enabled_p ())
 880                     dump_printf_loc (MSG_NOTE, vect_location,
 881                                      "Detected vectorizable nested cycle.\n");
 882
 883                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 884                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 885                                                              vect_nested_cycle;
 886                 }
 887               else
 888                 {
 889                   if (dump_enabled_p ())
 890                     dump_printf_loc (MSG_NOTE, vect_location,
 891                                      "Detected reduction.\n");
 892
 893                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 894                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 895                                                            vect_reduction_def;
 896                   /* Store the reduction cycles for possible vectorization in
 897                      loop-aware SLP if it was not detected as reduction
 898                      chain.  */
 899                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 900                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 901                 }
 902             }
 903         }
 904       else
 905         if (dump_enabled_p ())
 906           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 907                            "Unknown def-use cycle pattern.\n");
 908     }
 909 }
 910
 911
 912 /* Function vect_analyze_scalar_cycles.
 913
 914    Examine the cross iteration def-use cycles of scalar variables, by
 915    analyzing the loop-header PHIs of scalar variables.  Classify each
 916    cycle as one of the following: invariant, induction, reduction, unknown.
 917    We do that for the loop represented by LOOP_VINFO, and also to its
 918    inner-loop, if exists.
 919    Examples for scalar cycles:
 920
 921    Example1: reduction:
 922
 923               loop1:
 924               for (i=0; i<N; i++)
 925                  sum += a[i];
 926
 927    Example2: induction:
 928
 929               loop2:
 930               for (i=0; i<N; i++)
 931                  a[i] = i;  */
 932
 933 static void
 934 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 935 {
 936   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 937
 938   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 939
 940   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 941      Reductions in such inner-loop therefore have different properties than
 942      the reductions in the nest that gets vectorized:
 943      1. When vectorized, they are executed in the same order as in the original
 944         scalar loop, so we can't change the order of computation when
 945         vectorizing them.
 946      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 947         current checks are too strict.  */
 948
 949   if (loop->inner)
 950     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 951 }
 952
 953 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 954
 955 static void
 956 vect_fixup_reduc_chain (gimple *stmt)
 957 {
 958   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 959   gimple *stmtp;
 960   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 961               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 962   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 963   do
 964     {
 965       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 966       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 967       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 968       if (stmt)
 969         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 970           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 971     }
 972   while (stmt);
 973   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 974 }
 975
 976 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 977
 978 static void
 979 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 980 {
 981   gimple *first;
 982   unsigned i;
 983
 984   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 985     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 986       {
 987         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 988         while (next)
 989           {
 990             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 991               break;
 992             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 993           }
 994         /* If not all stmt in the chain are patterns try to handle
 995            the chain without patterns.  */
 996         if (! next)
 997           {
 998             vect_fixup_reduc_chain (first);
 999             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1000               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1001           }
1002       }
1003 }
1004
1005 /* Function vect_get_loop_niters.
1006
1007    Determine how many iterations the loop is executed and place it
1008    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1009    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1010    niter information holds in ASSUMPTIONS.
1011
1012    Return the loop exit condition.  */
1013
1014
1015 static gcond *
1016 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1017                       tree *number_of_iterations, tree *number_of_iterationsm1)
1018 {
1019   edge exit = single_exit (loop);
1020   struct tree_niter_desc niter_desc;
1021   tree niter_assumptions, niter, may_be_zero;
1022   gcond *cond = get_loop_exit_condition (loop);
1023
1024   *assumptions = boolean_true_node;
1025   *number_of_iterationsm1 = chrec_dont_know;
1026   *number_of_iterations = chrec_dont_know;
1027   if (dump_enabled_p ())
1028     dump_printf_loc (MSG_NOTE, vect_location,
1029                      "=== get_loop_niters ===\n");
1030
1031   if (!exit)
1032     return cond;
1033
1034   niter = chrec_dont_know;
1035   may_be_zero = NULL_TREE;
1036   niter_assumptions = boolean_true_node;
1037   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1038       || chrec_contains_undetermined (niter_desc.niter))
1039     return cond;
1040
1041   niter_assumptions = niter_desc.assumptions;
1042   may_be_zero = niter_desc.may_be_zero;
1043   niter = niter_desc.niter;
1044
1045   if (may_be_zero && integer_zerop (may_be_zero))
1046     may_be_zero = NULL_TREE;
1047
1048   if (may_be_zero)
1049     {
1050       if (COMPARISON_CLASS_P (may_be_zero))
1051         {
1052           /* Try to combine may_be_zero with assumptions, this can simplify
1053              computation of niter expression.  */
1054           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1055             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1056                                              niter_assumptions,
1057                                              fold_build1 (TRUTH_NOT_EXPR,
1058                                                           boolean_type_node,
1059                                                           may_be_zero));
1060           else
1061             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1062                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1063
1064           may_be_zero = NULL_TREE;
1065         }
1066       else if (integer_nonzerop (may_be_zero))
1067         {
1068           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1069           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1070           return cond;
1071         }
1072       else
1073         return cond;
1074     }
1075
1076   *assumptions = niter_assumptions;
1077   *number_of_iterationsm1 = niter;
1078
1079   /* We want the number of loop header executions which is the number
1080      of latch executions plus one.
1081      ???  For UINT_MAX latch executions this number overflows to zero
1082      for loops like do { n++; } while (n != 0);  */
1083   if (niter && !chrec_contains_undetermined (niter))
1084     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1085                           build_int_cst (TREE_TYPE (niter), 1));
1086   *number_of_iterations = niter;
1087
1088   return cond;
1089 }
1090
1091 /* Function bb_in_loop_p
1092
1093    Used as predicate for dfs order traversal of the loop bbs.  */
1094
1095 static bool
1096 bb_in_loop_p (const_basic_block bb, const void *data)
1097 {
1098   const struct loop *const loop = (const struct loop *)data;
1099   if (flow_bb_inside_loop_p (loop, bb))
1100     return true;
1101   return false;
1102 }
1103
1104
1105 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1106    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1107
1108 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1109   : vec_info (vec_info::loop, init_cost (loop_in)),
1110     loop (loop_in),
1111     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1112     num_itersm1 (NULL_TREE),
1113     num_iters (NULL_TREE),
1114     num_iters_unchanged (NULL_TREE),
1115     num_iters_assumptions (NULL_TREE),
1116     th (0),
1117     versioning_threshold (0),
1118     vectorization_factor (0),
1119     max_vectorization_factor (0),
1120     unaligned_dr (NULL),
1121     peeling_for_alignment (0),
1122     ptr_mask (0),
1123     slp_unrolling_factor (1),
1124     single_scalar_iteration_cost (0),
1125     vectorizable (false),
1126     peeling_for_gaps (false),
1127     peeling_for_niter (false),
1128     operands_swapped (false),
1129     no_data_dependencies (false),
1130     has_mask_store (false),
1131     scalar_loop (NULL),
1132     orig_loop_info (NULL)
1133 {
1134   /* Create/Update stmt_info for all stmts in the loop.  */
1135   basic_block *body = get_loop_body (loop);
1136   for (unsigned int i = 0; i < loop->num_nodes; i++)
1137     {
1138       basic_block bb = body[i];
1139       gimple_stmt_iterator si;
1140
1141       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1142         {
1143           gimple *phi = gsi_stmt (si);
1144           gimple_set_uid (phi, 0);
1145           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1146         }
1147
1148       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1149         {
1150           gimple *stmt = gsi_stmt (si);
1151           gimple_set_uid (stmt, 0);
1152           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1153         }
1154     }
1155   free (body);
1156
1157   /* CHECKME: We want to visit all BBs before their successors (except for
1158      latch blocks, for which this assertion wouldn't hold).  In the simple
1159      case of the loop forms we allow, a dfs order of the BBs would the same
1160      as reversed postorder traversal, so we are safe.  */
1161
1162   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1163                                           bbs, loop->num_nodes, loop);
1164   gcc_assert (nbbs == loop->num_nodes);
1165 }
1166
1167
1168 /* Free all memory used by the _loop_vec_info, as well as all the
1169    stmt_vec_info structs of all the stmts in the loop.  */
1170
1171 _loop_vec_info::~_loop_vec_info ()
1172 {
1173   int nbbs;
1174   gimple_stmt_iterator si;
1175   int j;
1176
1177   nbbs = loop->num_nodes;
1178   for (j = 0; j < nbbs; j++)
1179     {
1180       basic_block bb = bbs[j];
1181       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1182         free_stmt_vec_info (gsi_stmt (si));
1183
1184       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1185         {
1186           gimple *stmt = gsi_stmt (si);
1187
1188           /* We may have broken canonical form by moving a constant
1189              into RHS1 of a commutative op.  Fix such occurrences.  */
1190           if (operands_swapped && is_gimple_assign (stmt))
1191             {
1192               enum tree_code code = gimple_assign_rhs_code (stmt);
1193
1194               if ((code == PLUS_EXPR
1195                    || code == POINTER_PLUS_EXPR
1196                    || code == MULT_EXPR)
1197                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1198                 swap_ssa_operands (stmt,
1199                                    gimple_assign_rhs1_ptr (stmt),
1200                                    gimple_assign_rhs2_ptr (stmt));
1201               else if (code == COND_EXPR
1202                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1203                 {
1204                   tree cond_expr = gimple_assign_rhs1 (stmt);
1205                   enum tree_code cond_code = TREE_CODE (cond_expr);
1206
1207                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1208                     {
1209                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1210                                                                   0));
1211                       cond_code = invert_tree_comparison (cond_code,
1212                                                           honor_nans);
1213                       if (cond_code != ERROR_MARK)
1214                         {
1215                           TREE_SET_CODE (cond_expr, cond_code);
1216                           swap_ssa_operands (stmt,
1217                                              gimple_assign_rhs2_ptr (stmt),
1218                                              gimple_assign_rhs3_ptr (stmt));
1219                         }
1220                     }
1221                 }
1222             }
1223
1224           /* Free stmt_vec_info.  */
1225           free_stmt_vec_info (stmt);
1226           gsi_next (&si);
1227         }
1228     }
1229
1230   free (bbs);
1231
1232   loop->aux = NULL;
1233 }
1234
1235
1236 /* Calculate the cost of one scalar iteration of the loop.  */
1237 static void
1238 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1239 {
1240   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1241   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1242   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1243   int innerloop_iters, i;
1244
1245   /* Count statements in scalar loop.  Using this as scalar cost for a single
1246      iteration for now.
1247
1248      TODO: Add outer loop support.
1249
1250      TODO: Consider assigning different costs to different scalar
1251      statements.  */
1252
1253   /* FORNOW.  */
1254   innerloop_iters = 1;
1255   if (loop->inner)
1256     innerloop_iters = 50; /* FIXME */
1257
1258   for (i = 0; i < nbbs; i++)
1259     {
1260       gimple_stmt_iterator si;
1261       basic_block bb = bbs[i];
1262
1263       if (bb->loop_father == loop->inner)
1264         factor = innerloop_iters;
1265       else
1266         factor = 1;
1267
1268       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1269         {
1270           gimple *stmt = gsi_stmt (si);
1271           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1272
1273           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1274             continue;
1275
1276           /* Skip stmts that are not vectorized inside the loop.  */
1277           if (stmt_info
1278               && !STMT_VINFO_RELEVANT_P (stmt_info)
1279               && (!STMT_VINFO_LIVE_P (stmt_info)
1280                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1281               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1282             continue;
1283
1284           vect_cost_for_stmt kind;
1285           if (STMT_VINFO_DATA_REF (stmt_info))
1286             {
1287               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1288                kind = scalar_load;
1289              else
1290                kind = scalar_store;
1291             }
1292           else
1293             kind = scalar_stmt;
1294
1295           scalar_single_iter_cost
1296             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1297                                  factor, kind, stmt_info, 0, vect_prologue);
1298         }
1299     }
1300   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1301     = scalar_single_iter_cost;
1302 }
1303
1304
1305 /* Function vect_analyze_loop_form_1.
1306
1307    Verify that certain CFG restrictions hold, including:
1308    - the loop has a pre-header
1309    - the loop has a single entry and exit
1310    - the loop exit condition is simple enough
1311    - the number of iterations can be analyzed, i.e, a countable loop.  The
1312      niter could be analyzed under some assumptions.  */
1313
1314 bool
1315 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1316                           tree *assumptions, tree *number_of_iterationsm1,
1317                           tree *number_of_iterations, gcond **inner_loop_cond)
1318 {
1319   if (dump_enabled_p ())
1320     dump_printf_loc (MSG_NOTE, vect_location,
1321                      "=== vect_analyze_loop_form ===\n");
1322
1323   /* Different restrictions apply when we are considering an inner-most loop,
1324      vs. an outer (nested) loop.
1325      (FORNOW. May want to relax some of these restrictions in the future).  */
1326
1327   if (!loop->inner)
1328     {
1329       /* Inner-most loop.  We currently require that the number of BBs is
1330          exactly 2 (the header and latch).  Vectorizable inner-most loops
1331          look like this:
1332
1333                         (pre-header)
1334                            |
1335                           header <--------+
1336                            | |            |
1337                            | +--> latch --+
1338                            |
1339                         (exit-bb)  */
1340
1341       if (loop->num_nodes != 2)
1342         {
1343           if (dump_enabled_p ())
1344             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345                              "not vectorized: control flow in loop.\n");
1346           return false;
1347         }
1348
1349       if (empty_block_p (loop->header))
1350         {
1351           if (dump_enabled_p ())
1352             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1353                              "not vectorized: empty loop.\n");
1354           return false;
1355         }
1356     }
1357   else
1358     {
1359       struct loop *innerloop = loop->inner;
1360       edge entryedge;
1361
1362       /* Nested loop. We currently require that the loop is doubly-nested,
1363          contains a single inner loop, and the number of BBs is exactly 5.
1364          Vectorizable outer-loops look like this:
1365
1366                         (pre-header)
1367                            |
1368                           header <---+
1369                            |         |
1370                           inner-loop |
1371                            |         |
1372                           tail ------+
1373                            |
1374                         (exit-bb)
1375
1376          The inner-loop has the properties expected of inner-most loops
1377          as described above.  */
1378
1379       if ((loop->inner)->inner || (loop->inner)->next)
1380         {
1381           if (dump_enabled_p ())
1382             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1383                              "not vectorized: multiple nested loops.\n");
1384           return false;
1385         }
1386
1387       if (loop->num_nodes != 5)
1388         {
1389           if (dump_enabled_p ())
1390             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1391                              "not vectorized: control flow in loop.\n");
1392           return false;
1393         }
1394
1395       entryedge = loop_preheader_edge (innerloop);
1396       if (entryedge->src != loop->header
1397           || !single_exit (innerloop)
1398           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1399         {
1400           if (dump_enabled_p ())
1401             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1402                              "not vectorized: unsupported outerloop form.\n");
1403           return false;
1404         }
1405
1406       /* Analyze the inner-loop.  */
1407       tree inner_niterm1, inner_niter, inner_assumptions;
1408       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1409                                       &inner_assumptions, &inner_niterm1,
1410                                       &inner_niter, NULL)
1411           /* Don't support analyzing niter under assumptions for inner
1412              loop.  */
1413           || !integer_onep (inner_assumptions))
1414         {
1415           if (dump_enabled_p ())
1416             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417                              "not vectorized: Bad inner loop.\n");
1418           return false;
1419         }
1420
1421       if (!expr_invariant_in_loop_p (loop, inner_niter))
1422         {
1423           if (dump_enabled_p ())
1424             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1425                              "not vectorized: inner-loop count not"
1426                              " invariant.\n");
1427           return false;
1428         }
1429
1430       if (dump_enabled_p ())
1431         dump_printf_loc (MSG_NOTE, vect_location,
1432                          "Considering outer-loop vectorization.\n");
1433     }
1434
1435   if (!single_exit (loop)
1436       || EDGE_COUNT (loop->header->preds) != 2)
1437     {
1438       if (dump_enabled_p ())
1439         {
1440           if (!single_exit (loop))
1441             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1442                              "not vectorized: multiple exits.\n");
1443           else if (EDGE_COUNT (loop->header->preds) != 2)
1444             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1445                              "not vectorized: too many incoming edges.\n");
1446         }
1447       return false;
1448     }
1449
1450   /* We assume that the loop exit condition is at the end of the loop. i.e,
1451      that the loop is represented as a do-while (with a proper if-guard
1452      before the loop if needed), where the loop header contains all the
1453      executable statements, and the latch is empty.  */
1454   if (!empty_block_p (loop->latch)
1455       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1456     {
1457       if (dump_enabled_p ())
1458         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459                          "not vectorized: latch block not empty.\n");
1460       return false;
1461     }
1462
1463   /* Make sure the exit is not abnormal.  */
1464   edge e = single_exit (loop);
1465   if (e->flags & EDGE_ABNORMAL)
1466     {
1467       if (dump_enabled_p ())
1468         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1469                          "not vectorized: abnormal loop exit edge.\n");
1470       return false;
1471     }
1472
1473   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1474                                      number_of_iterationsm1);
1475   if (!*loop_cond)
1476     {
1477       if (dump_enabled_p ())
1478         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1479                          "not vectorized: complicated exit condition.\n");
1480       return false;
1481     }
1482
1483   if (integer_zerop (*assumptions)
1484       || !*number_of_iterations
1485       || chrec_contains_undetermined (*number_of_iterations))
1486     {
1487       if (dump_enabled_p ())
1488         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489                          "not vectorized: number of iterations cannot be "
1490                          "computed.\n");
1491       return false;
1492     }
1493
1494   if (integer_zerop (*number_of_iterations))
1495     {
1496       if (dump_enabled_p ())
1497         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1498                          "not vectorized: number of iterations = 0.\n");
1499       return false;
1500     }
1501
1502   return true;
1503 }
1504
1505 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1506
1507 loop_vec_info
1508 vect_analyze_loop_form (struct loop *loop)
1509 {
1510   tree assumptions, number_of_iterations, number_of_iterationsm1;
1511   gcond *loop_cond, *inner_loop_cond = NULL;
1512
1513   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1514                                   &assumptions, &number_of_iterationsm1,
1515                                   &number_of_iterations, &inner_loop_cond))
1516     return NULL;
1517
1518   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1519   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1520   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1521   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1522   if (!integer_onep (assumptions))
1523     {
1524       /* We consider to vectorize this loop by versioning it under
1525          some assumptions.  In order to do this, we need to clear
1526          existing information computed by scev and niter analyzer.  */
1527       scev_reset_htab ();
1528       free_numbers_of_iterations_estimates (loop);
1529       /* Also set flag for this loop so that following scev and niter
1530          analysis are done under the assumptions.  */
1531       loop_constraint_set (loop, LOOP_C_FINITE);
1532       /* Also record the assumptions for versioning.  */
1533       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1534     }
1535
1536   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1537     {
1538       if (dump_enabled_p ())
1539         {
1540           dump_printf_loc (MSG_NOTE, vect_location,
1541                            "Symbolic number of iterations is ");
1542           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1543           dump_printf (MSG_NOTE, "\n");
1544         }
1545     }
1546
1547   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1548   if (inner_loop_cond)
1549     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1550       = loop_exit_ctrl_vec_info_type;
1551
1552   gcc_assert (!loop->aux);
1553   loop->aux = loop_vinfo;
1554   return loop_vinfo;
1555 }
1556
1557
1558
1559 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1560    statements update the vectorization factor.  */
1561
1562 static void
1563 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1564 {
1565   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1566   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1567   int nbbs = loop->num_nodes;
1568   poly_uint64 vectorization_factor;
1569   int i;
1570
1571   if (dump_enabled_p ())
1572     dump_printf_loc (MSG_NOTE, vect_location,
1573                      "=== vect_update_vf_for_slp ===\n");
1574
1575   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1576   gcc_assert (known_ne (vectorization_factor, 0U));
1577
1578   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1579      vectorization factor of the loop is the unrolling factor required by
1580      the SLP instances.  If that unrolling factor is 1, we say, that we
1581      perform pure SLP on loop - cross iteration parallelism is not
1582      exploited.  */
1583   bool only_slp_in_loop = true;
1584   for (i = 0; i < nbbs; i++)
1585     {
1586       basic_block bb = bbs[i];
1587       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1588            gsi_next (&si))
1589         {
1590           gimple *stmt = gsi_stmt (si);
1591           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1592           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1593               && STMT_VINFO_RELATED_STMT (stmt_info))
1594             {
1595               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1596               stmt_info = vinfo_for_stmt (stmt);
1597             }
1598           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1599                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1600               && !PURE_SLP_STMT (stmt_info))
1601             /* STMT needs both SLP and loop-based vectorization.  */
1602             only_slp_in_loop = false;
1603         }
1604     }
1605
1606   if (only_slp_in_loop)
1607     {
1608       dump_printf_loc (MSG_NOTE, vect_location,
1609                        "Loop contains only SLP stmts\n");
1610       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1611     }
1612   else
1613     {
1614       dump_printf_loc (MSG_NOTE, vect_location,
1615                        "Loop contains SLP and non-SLP stmts\n");
1616       /* Both the vectorization factor and unroll factor have the form
1617          current_vector_size * X for some rational X, so they must have
1618          a common multiple.  */
1619       vectorization_factor
1620         = force_common_multiple (vectorization_factor,
1621                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1622     }
1623
1624   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1625   if (dump_enabled_p ())
1626     {
1627       dump_printf_loc (MSG_NOTE, vect_location,
1628                        "Updating vectorization factor to ");
1629       dump_dec (MSG_NOTE, vectorization_factor);
1630       dump_printf (MSG_NOTE, ".\n");
1631     }
1632 }
1633
1634 /* Function vect_analyze_loop_operations.
1635
1636    Scan the loop stmts and make sure they are all vectorizable.  */
1637
1638 static bool
1639 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1640 {
1641   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1642   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1643   int nbbs = loop->num_nodes;
1644   int i;
1645   stmt_vec_info stmt_info;
1646   bool need_to_vectorize = false;
1647   bool ok;
1648
1649   if (dump_enabled_p ())
1650     dump_printf_loc (MSG_NOTE, vect_location,
1651                      "=== vect_analyze_loop_operations ===\n");
1652
1653   for (i = 0; i < nbbs; i++)
1654     {
1655       basic_block bb = bbs[i];
1656
1657       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1658            gsi_next (&si))
1659         {
1660           gphi *phi = si.phi ();
1661           ok = true;
1662
1663           stmt_info = vinfo_for_stmt (phi);
1664           if (dump_enabled_p ())
1665             {
1666               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1667               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1668             }
1669           if (virtual_operand_p (gimple_phi_result (phi)))
1670             continue;
1671
1672           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1673              (i.e., a phi in the tail of the outer-loop).  */
1674           if (! is_loop_header_bb_p (bb))
1675             {
1676               /* FORNOW: we currently don't support the case that these phis
1677                  are not used in the outerloop (unless it is double reduction,
1678                  i.e., this phi is vect_reduction_def), cause this case
1679                  requires to actually do something here.  */
1680               if (STMT_VINFO_LIVE_P (stmt_info)
1681                   && STMT_VINFO_DEF_TYPE (stmt_info)
1682                      != vect_double_reduction_def)
1683                 {
1684                   if (dump_enabled_p ())
1685                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686                                      "Unsupported loop-closed phi in "
1687                                      "outer-loop.\n");
1688                   return false;
1689                 }
1690
1691               /* If PHI is used in the outer loop, we check that its operand
1692                  is defined in the inner loop.  */
1693               if (STMT_VINFO_RELEVANT_P (stmt_info))
1694                 {
1695                   tree phi_op;
1696                   gimple *op_def_stmt;
1697
1698                   if (gimple_phi_num_args (phi) != 1)
1699                     return false;
1700
1701                   phi_op = PHI_ARG_DEF (phi, 0);
1702                   if (TREE_CODE (phi_op) != SSA_NAME)
1703                     return false;
1704
1705                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1706                   if (gimple_nop_p (op_def_stmt)
1707                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1708                       || !vinfo_for_stmt (op_def_stmt))
1709                     return false;
1710
1711                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1712                         != vect_used_in_outer
1713                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1714                            != vect_used_in_outer_by_reduction)
1715                     return false;
1716                 }
1717
1718               continue;
1719             }
1720
1721           gcc_assert (stmt_info);
1722
1723           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1724                || STMT_VINFO_LIVE_P (stmt_info))
1725               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1726             {
1727               /* A scalar-dependence cycle that we don't support.  */
1728               if (dump_enabled_p ())
1729                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730                                  "not vectorized: scalar dependence cycle.\n");
1731               return false;
1732             }
1733
1734           if (STMT_VINFO_RELEVANT_P (stmt_info))
1735             {
1736               need_to_vectorize = true;
1737               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1738                   && ! PURE_SLP_STMT (stmt_info))
1739                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1740               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1741                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1742                        && ! PURE_SLP_STMT (stmt_info))
1743                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1744             }
1745
1746           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1747             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1748
1749           if (!ok)
1750             {
1751               if (dump_enabled_p ())
1752                 {
1753                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1754                                    "not vectorized: relevant phi not "
1755                                    "supported: ");
1756                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1757                 }
1758               return false;
1759             }
1760         }
1761
1762       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1763            gsi_next (&si))
1764         {
1765           gimple *stmt = gsi_stmt (si);
1766           if (!gimple_clobber_p (stmt)
1767               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1768             return false;
1769         }
1770     } /* bbs */
1771
1772   /* All operations in the loop are either irrelevant (deal with loop
1773      control, or dead), or only used outside the loop and can be moved
1774      out of the loop (e.g. invariants, inductions).  The loop can be
1775      optimized away by scalar optimizations.  We're better off not
1776      touching this loop.  */
1777   if (!need_to_vectorize)
1778     {
1779       if (dump_enabled_p ())
1780         dump_printf_loc (MSG_NOTE, vect_location,
1781                          "All the computation can be taken out of the loop.\n");
1782       if (dump_enabled_p ())
1783         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1784                          "not vectorized: redundant loop. no profit to "
1785                          "vectorize.\n");
1786       return false;
1787     }
1788
1789   return true;
1790 }
1791
1792
1793 /* Function vect_analyze_loop_2.
1794
1795    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1796    for it.  The different analyses will record information in the
1797    loop_vec_info struct.  */
1798 static bool
1799 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1800 {
1801   bool ok;
1802   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1803   poly_uint64 min_vf = 2;
1804   unsigned int n_stmts = 0;
1805
1806   /* The first group of checks is independent of the vector size.  */
1807   fatal = true;
1808
1809   /* Find all data references in the loop (which correspond to vdefs/vuses)
1810      and analyze their evolution in the loop.  */
1811
1812   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1813
1814   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1815   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1816     {
1817       if (dump_enabled_p ())
1818         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1819                          "not vectorized: loop nest containing two "
1820                          "or more consecutive inner loops cannot be "
1821                          "vectorized\n");
1822       return false;
1823     }
1824
1825   for (unsigned i = 0; i < loop->num_nodes; i++)
1826     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1827          !gsi_end_p (gsi); gsi_next (&gsi))
1828       {
1829         gimple *stmt = gsi_stmt (gsi);
1830         if (is_gimple_debug (stmt))
1831           continue;
1832         ++n_stmts;
1833         if (!find_data_references_in_stmt (loop, stmt,
1834                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1835           {
1836             if (is_gimple_call (stmt) && loop->safelen)
1837               {
1838                 tree fndecl = gimple_call_fndecl (stmt), op;
1839                 if (fndecl != NULL_TREE)
1840                   {
1841                     cgraph_node *node = cgraph_node::get (fndecl);
1842                     if (node != NULL && node->simd_clones != NULL)
1843                       {
1844                         unsigned int j, n = gimple_call_num_args (stmt);
1845                         for (j = 0; j < n; j++)
1846                           {
1847                             op = gimple_call_arg (stmt, j);
1848                             if (DECL_P (op)
1849                                 || (REFERENCE_CLASS_P (op)
1850                                     && get_base_address (op)))
1851                               break;
1852                           }
1853                         op = gimple_call_lhs (stmt);
1854                         /* Ignore #pragma omp declare simd functions
1855                            if they don't have data references in the
1856                            call stmt itself.  */
1857                         if (j == n
1858                             && !(op
1859                                  && (DECL_P (op)
1860                                      || (REFERENCE_CLASS_P (op)
1861                                          && get_base_address (op)))))
1862                           continue;
1863                       }
1864                   }
1865               }
1866             if (dump_enabled_p ())
1867               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868                                "not vectorized: loop contains function "
1869                                "calls or data references that cannot "
1870                                "be analyzed\n");
1871             return false;
1872           }
1873       }
1874
1875   /* Analyze the data references and also adjust the minimal
1876      vectorization factor according to the loads and stores.  */
1877
1878   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1879   if (!ok)
1880     {
1881       if (dump_enabled_p ())
1882         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1883                          "bad data references.\n");
1884       return false;
1885     }
1886
1887   /* Classify all cross-iteration scalar data-flow cycles.
1888      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1889   vect_analyze_scalar_cycles (loop_vinfo);
1890
1891   vect_pattern_recog (loop_vinfo);
1892
1893   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1894
1895   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1896      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1897
1898   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1899   if (!ok)
1900     {
1901       if (dump_enabled_p ())
1902         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1903                          "bad data access.\n");
1904       return false;
1905     }
1906
1907   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1908
1909   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1910   if (!ok)
1911     {
1912       if (dump_enabled_p ())
1913         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914                          "unexpected pattern.\n");
1915       return false;
1916     }
1917
1918   /* While the rest of the analysis below depends on it in some way.  */
1919   fatal = false;
1920
1921   /* Analyze data dependences between the data-refs in the loop
1922      and adjust the maximum vectorization factor according to
1923      the dependences.
1924      FORNOW: fail at the first data dependence that we encounter.  */
1925
1926   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1927   if (!ok
1928       || (max_vf != MAX_VECTORIZATION_FACTOR
1929           && maybe_lt (max_vf, min_vf)))
1930     {
1931       if (dump_enabled_p ())
1932             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1933                              "bad data dependence.\n");
1934       return false;
1935     }
1936   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1937
1938   ok = vect_determine_vectorization_factor (loop_vinfo);
1939   if (!ok)
1940     {
1941       if (dump_enabled_p ())
1942         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1943                          "can't determine vectorization factor.\n");
1944       return false;
1945     }
1946   if (max_vf != MAX_VECTORIZATION_FACTOR
1947       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1948     {
1949       if (dump_enabled_p ())
1950         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1951                          "bad data dependence.\n");
1952       return false;
1953     }
1954
1955   /* Compute the scalar iteration cost.  */
1956   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1957
1958   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1959   HOST_WIDE_INT estimated_niter;
1960   unsigned th;
1961   int min_scalar_loop_bound;
1962
1963   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1964   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1965   if (!ok)
1966     return false;
1967
1968   /* If there are any SLP instances mark them as pure_slp.  */
1969   bool slp = vect_make_slp_decision (loop_vinfo);
1970   if (slp)
1971     {
1972       /* Find stmts that need to be both vectorized and SLPed.  */
1973       vect_detect_hybrid_slp (loop_vinfo);
1974
1975       /* Update the vectorization factor based on the SLP decision.  */
1976       vect_update_vf_for_slp (loop_vinfo);
1977     }
1978
1979   /* This is the point where we can re-start analysis with SLP forced off.  */
1980 start_over:
1981
1982   /* Now the vectorization factor is final.  */
1983   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1984   gcc_assert (known_ne (vectorization_factor, 0U));
1985   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1986
1987   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1988     {
1989       dump_printf_loc (MSG_NOTE, vect_location,
1990                        "vectorization_factor = ");
1991       dump_dec (MSG_NOTE, vectorization_factor);
1992       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1993                    LOOP_VINFO_INT_NITERS (loop_vinfo));
1994     }
1995
1996   HOST_WIDE_INT max_niter
1997     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1998   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1999        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < assumed_vf))
2000       || (max_niter != -1
2001           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf))
2002     {
2003       if (dump_enabled_p ())
2004         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2005                          "not vectorized: iteration count smaller than "
2006                          "vectorization factor.\n");
2007       return false;
2008     }
2009
2010   /* Analyze the alignment of the data-refs in the loop.
2011      Fail if a data reference is found that cannot be vectorized.  */
2012
2013   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2014   if (!ok)
2015     {
2016       if (dump_enabled_p ())
2017         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018                          "bad data alignment.\n");
2019       return false;
2020     }
2021
2022   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2023      It is important to call pruning after vect_analyze_data_ref_accesses,
2024      since we use grouping information gathered by interleaving analysis.  */
2025   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2026   if (!ok)
2027     return false;
2028
2029   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2030      vectorization.  */
2031   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2032     {
2033     /* This pass will decide on using loop versioning and/or loop peeling in
2034        order to enhance the alignment of data references in the loop.  */
2035     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2036     if (!ok)
2037       {
2038         if (dump_enabled_p ())
2039           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2040                            "bad data alignment.\n");
2041         return false;
2042       }
2043     }
2044
2045   if (slp)
2046     {
2047       /* Analyze operations in the SLP instances.  Note this may
2048          remove unsupported SLP instances which makes the above
2049          SLP kind detection invalid.  */
2050       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2051       vect_slp_analyze_operations (loop_vinfo);
2052       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2053         goto again;
2054     }
2055
2056   /* Scan all the remaining operations in the loop that are not subject
2057      to SLP and make sure they are vectorizable.  */
2058   ok = vect_analyze_loop_operations (loop_vinfo);
2059   if (!ok)
2060     {
2061       if (dump_enabled_p ())
2062         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063                          "bad operation or unsupported loop bound.\n");
2064       return false;
2065     }
2066
2067   /* If epilog loop is required because of data accesses with gaps,
2068      one additional iteration needs to be peeled.  Check if there is
2069      enough iterations for vectorization.  */
2070   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2071       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2072     {
2073       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2074       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2075
2076       if (known_lt (wi::to_widest (scalar_niters), vf))
2077         {
2078           if (dump_enabled_p ())
2079             dump_printf_loc (MSG_NOTE, vect_location,
2080                              "loop has no enough iterations to support"
2081                              " peeling for gaps.\n");
2082           return false;
2083         }
2084     }
2085
2086   /* Analyze cost.  Decide if worth while to vectorize.  */
2087   int min_profitable_estimate, min_profitable_iters;
2088   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2089                                       &min_profitable_estimate);
2090
2091   if (min_profitable_iters < 0)
2092     {
2093       if (dump_enabled_p ())
2094         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2095                          "not vectorized: vectorization not profitable.\n");
2096       if (dump_enabled_p ())
2097         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098                          "not vectorized: vector version will never be "
2099                          "profitable.\n");
2100       goto again;
2101     }
2102
2103   min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2104                            * assumed_vf);
2105
2106   /* Use the cost model only if it is more conservative than user specified
2107      threshold.  */
2108   th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2109
2110   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2111
2112   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2113       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2114     {
2115       if (dump_enabled_p ())
2116         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117                          "not vectorized: vectorization not profitable.\n");
2118       if (dump_enabled_p ())
2119         dump_printf_loc (MSG_NOTE, vect_location,
2120                          "not vectorized: iteration count smaller than user "
2121                          "specified loop bound parameter or minimum profitable "
2122                          "iterations (whichever is more conservative).\n");
2123       goto again;
2124     }
2125
2126   estimated_niter
2127     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2128   if (estimated_niter == -1)
2129     estimated_niter = max_niter;
2130   if (estimated_niter != -1
2131       && ((unsigned HOST_WIDE_INT) estimated_niter
2132           < MAX (th, (unsigned) min_profitable_estimate)))
2133     {
2134       if (dump_enabled_p ())
2135         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136                          "not vectorized: estimated iteration count too "
2137                          "small.\n");
2138       if (dump_enabled_p ())
2139         dump_printf_loc (MSG_NOTE, vect_location,
2140                          "not vectorized: estimated iteration count smaller "
2141                          "than specified loop bound parameter or minimum "
2142                          "profitable iterations (whichever is more "
2143                          "conservative).\n");
2144       goto again;
2145     }
2146
2147   /* Decide whether we need to create an epilogue loop to handle
2148      remaining scalar iterations.  */
2149   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2150
2151   unsigned HOST_WIDE_INT const_vf;
2152   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2153       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2154     {
2155       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2156                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2157                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2158         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2159     }
2160   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2161            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2162            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2163                 < (unsigned) exact_log2 (const_vf))
2164                /* In case of versioning, check if the maximum number of
2165                   iterations is greater than th.  If they are identical,
2166                   the epilogue is unnecessary.  */
2167                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2168                    || ((unsigned HOST_WIDE_INT) max_niter
2169                        > (th / const_vf) * const_vf))))
2170     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2171
2172   /* If an epilogue loop is required make sure we can create one.  */
2173   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2174       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2175     {
2176       if (dump_enabled_p ())
2177         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2178       if (!vect_can_advance_ivs_p (loop_vinfo)
2179           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2180                                            single_exit (LOOP_VINFO_LOOP
2181                                                          (loop_vinfo))))
2182         {
2183           if (dump_enabled_p ())
2184             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2185                              "not vectorized: can't create required "
2186                              "epilog loop\n");
2187           goto again;
2188         }
2189     }
2190
2191   /* During peeling, we need to check if number of loop iterations is
2192      enough for both peeled prolog loop and vector loop.  This check
2193      can be merged along with threshold check of loop versioning, so
2194      increase threshold for this case if necessary.  */
2195   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2196     {
2197       poly_uint64 niters_th;
2198
2199       /* Niters for peeled prolog loop.  */
2200       if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2201         {
2202           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2203           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2204
2205           niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2206         }
2207       else
2208         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2209
2210       /* Niters for at least one iteration of vectorized loop.  */
2211       niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2212       /* One additional iteration because of peeling for gap.  */
2213       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2214         niters_th += 1;
2215       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2216     }
2217
2218   gcc_assert (known_eq (vectorization_factor,
2219                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2220
2221   /* Ok to vectorize!  */
2222   return true;
2223
2224 again:
2225   /* Try again with SLP forced off but if we didn't do any SLP there is
2226      no point in re-trying.  */
2227   if (!slp)
2228     return false;
2229
2230   /* If there are reduction chains re-trying will fail anyway.  */
2231   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2232     return false;
2233
2234   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2235      via interleaving or lane instructions.  */
2236   slp_instance instance;
2237   slp_tree node;
2238   unsigned i, j;
2239   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2240     {
2241       stmt_vec_info vinfo;
2242       vinfo = vinfo_for_stmt
2243           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2244       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2245         continue;
2246       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2247       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2248       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2249       if (! vect_store_lanes_supported (vectype, size)
2250           && ! vect_grouped_store_supported (vectype, size))
2251         return false;
2252       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2253         {
2254           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2255           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2256           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2257           size = STMT_VINFO_GROUP_SIZE (vinfo);
2258           vectype = STMT_VINFO_VECTYPE (vinfo);
2259           if (! vect_load_lanes_supported (vectype, size)
2260               && ! vect_grouped_load_supported (vectype, single_element_p,
2261                                                 size))
2262             return false;
2263         }
2264     }
2265
2266   if (dump_enabled_p ())
2267     dump_printf_loc (MSG_NOTE, vect_location,
2268                      "re-trying with SLP disabled\n");
2269
2270   /* Roll back state appropriately.  No SLP this time.  */
2271   slp = false;
2272   /* Restore vectorization factor as it were without SLP.  */
2273   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2274   /* Free the SLP instances.  */
2275   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2276     vect_free_slp_instance (instance);
2277   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2278   /* Reset SLP type to loop_vect on all stmts.  */
2279   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2280     {
2281       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2282       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2283            !gsi_end_p (si); gsi_next (&si))
2284         {
2285           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2286           STMT_SLP_TYPE (stmt_info) = loop_vect;
2287         }
2288       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2289            !gsi_end_p (si); gsi_next (&si))
2290         {
2291           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2292           STMT_SLP_TYPE (stmt_info) = loop_vect;
2293           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2294             {
2295               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2296               STMT_SLP_TYPE (stmt_info) = loop_vect;
2297               for (gimple_stmt_iterator pi
2298                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2299                    !gsi_end_p (pi); gsi_next (&pi))
2300                 {
2301                   gimple *pstmt = gsi_stmt (pi);
2302                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2303                 }
2304             }
2305         }
2306     }
2307   /* Free optimized alias test DDRS.  */
2308   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2309   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2310   /* Reset target cost data.  */
2311   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2312   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2313     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2314   /* Reset assorted flags.  */
2315   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2316   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2317   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2318   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2319
2320   goto start_over;
2321 }
2322
2323 /* Function vect_analyze_loop.
2324
2325    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2326    for it.  The different analyses will record information in the
2327    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2328    be vectorized.  */
2329 loop_vec_info
2330 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2331 {
2332   loop_vec_info loop_vinfo;
2333   unsigned int vector_sizes;
2334
2335   /* Autodetect first vector size we try.  */
2336   current_vector_size = 0;
2337   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2338
2339   if (dump_enabled_p ())
2340     dump_printf_loc (MSG_NOTE, vect_location,
2341                      "===== analyze_loop_nest =====\n");
2342
2343   if (loop_outer (loop)
2344       && loop_vec_info_for_loop (loop_outer (loop))
2345       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2346     {
2347       if (dump_enabled_p ())
2348         dump_printf_loc (MSG_NOTE, vect_location,
2349                          "outer-loop already vectorized.\n");
2350       return NULL;
2351     }
2352
2353   while (1)
2354     {
2355       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2356       loop_vinfo = vect_analyze_loop_form (loop);
2357       if (!loop_vinfo)
2358         {
2359           if (dump_enabled_p ())
2360             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361                              "bad loop form.\n");
2362           return NULL;
2363         }
2364
2365       bool fatal = false;
2366
2367       if (orig_loop_vinfo)
2368         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2369
2370       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2371         {
2372           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2373
2374           return loop_vinfo;
2375         }
2376
2377       delete loop_vinfo;
2378
2379       vector_sizes &= ~current_vector_size;
2380       if (fatal
2381           || vector_sizes == 0
2382           || current_vector_size == 0)
2383         return NULL;
2384
2385       /* Try the next biggest vector size.  */
2386       current_vector_size = 1 << floor_log2 (vector_sizes);
2387       if (dump_enabled_p ())
2388         dump_printf_loc (MSG_NOTE, vect_location,
2389                          "***** Re-trying analysis with "
2390                          "vector size %d\n", current_vector_size);
2391     }
2392 }
2393
2394
2395 /* Function reduction_fn_for_scalar_code
2396
2397    Input:
2398    CODE - tree_code of a reduction operations.
2399
2400    Output:
2401    REDUC_FN - the corresponding internal function to be used to reduce the
2402       vector of partial results into a single scalar result, or IFN_LAST
2403       if the operation is a supported reduction operation, but does not have
2404       such an internal function.
2405
2406    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2407
2408 static bool
2409 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2410 {
2411   switch (code)
2412     {
2413       case MAX_EXPR:
2414         *reduc_fn = IFN_REDUC_MAX;
2415         return true;
2416
2417       case MIN_EXPR:
2418         *reduc_fn = IFN_REDUC_MIN;
2419         return true;
2420
2421       case PLUS_EXPR:
2422         *reduc_fn = IFN_REDUC_PLUS;
2423         return true;
2424
2425       case MULT_EXPR:
2426       case MINUS_EXPR:
2427       case BIT_IOR_EXPR:
2428       case BIT_XOR_EXPR:
2429       case BIT_AND_EXPR:
2430         *reduc_fn = IFN_LAST;
2431         return true;
2432
2433       default:
2434        return false;
2435     }
2436 }
2437
2438
2439 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2440    STMT is printed with a message MSG. */
2441
2442 static void
2443 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2444 {
2445   dump_printf_loc (msg_type, vect_location, "%s", msg);
2446   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2447 }
2448
2449
2450 /* Detect SLP reduction of the form:
2451
2452    #a1 = phi <a5, a0>
2453    a2 = operation (a1)
2454    a3 = operation (a2)
2455    a4 = operation (a3)
2456    a5 = operation (a4)
2457
2458    #a = phi <a5>
2459
2460    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2461    FIRST_STMT is the first reduction stmt in the chain
2462    (a2 = operation (a1)).
2463
2464    Return TRUE if a reduction chain was detected.  */
2465
2466 static bool
2467 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2468                        gimple *first_stmt)
2469 {
2470   struct loop *loop = (gimple_bb (phi))->loop_father;
2471   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2472   enum tree_code code;
2473   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2474   stmt_vec_info use_stmt_info, current_stmt_info;
2475   tree lhs;
2476   imm_use_iterator imm_iter;
2477   use_operand_p use_p;
2478   int nloop_uses, size = 0, n_out_of_loop_uses;
2479   bool found = false;
2480
2481   if (loop != vect_loop)
2482     return false;
2483
2484   lhs = PHI_RESULT (phi);
2485   code = gimple_assign_rhs_code (first_stmt);
2486   while (1)
2487     {
2488       nloop_uses = 0;
2489       n_out_of_loop_uses = 0;
2490       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2491         {
2492           gimple *use_stmt = USE_STMT (use_p);
2493           if (is_gimple_debug (use_stmt))
2494             continue;
2495
2496           /* Check if we got back to the reduction phi.  */
2497           if (use_stmt == phi)
2498             {
2499               loop_use_stmt = use_stmt;
2500               found = true;
2501               break;
2502             }
2503
2504           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2505             {
2506               loop_use_stmt = use_stmt;
2507               nloop_uses++;
2508             }
2509            else
2510              n_out_of_loop_uses++;
2511
2512            /* There are can be either a single use in the loop or two uses in
2513               phi nodes.  */
2514            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2515              return false;
2516         }
2517
2518       if (found)
2519         break;
2520
2521       /* We reached a statement with no loop uses.  */
2522       if (nloop_uses == 0)
2523         return false;
2524
2525       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2526       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2527         return false;
2528
2529       if (!is_gimple_assign (loop_use_stmt)
2530           || code != gimple_assign_rhs_code (loop_use_stmt)
2531           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2532         return false;
2533
2534       /* Insert USE_STMT into reduction chain.  */
2535       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2536       if (current_stmt)
2537         {
2538           current_stmt_info = vinfo_for_stmt (current_stmt);
2539           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2540           GROUP_FIRST_ELEMENT (use_stmt_info)
2541             = GROUP_FIRST_ELEMENT (current_stmt_info);
2542         }
2543       else
2544         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2545
2546       lhs = gimple_assign_lhs (loop_use_stmt);
2547       current_stmt = loop_use_stmt;
2548       size++;
2549    }
2550
2551   if (!found || loop_use_stmt != phi || size < 2)
2552     return false;
2553
2554   /* Swap the operands, if needed, to make the reduction operand be the second
2555      operand.  */
2556   lhs = PHI_RESULT (phi);
2557   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2558   while (next_stmt)
2559     {
2560       if (gimple_assign_rhs2 (next_stmt) == lhs)
2561         {
2562           tree op = gimple_assign_rhs1 (next_stmt);
2563           gimple *def_stmt = NULL;
2564
2565           if (TREE_CODE (op) == SSA_NAME)
2566             def_stmt = SSA_NAME_DEF_STMT (op);
2567
2568           /* Check that the other def is either defined in the loop
2569              ("vect_internal_def"), or it's an induction (defined by a
2570              loop-header phi-node).  */
2571           if (def_stmt
2572               && gimple_bb (def_stmt)
2573               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2574               && (is_gimple_assign (def_stmt)
2575                   || is_gimple_call (def_stmt)
2576                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2577                            == vect_induction_def
2578                   || (gimple_code (def_stmt) == GIMPLE_PHI
2579                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2580                                   == vect_internal_def
2581                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2582             {
2583               lhs = gimple_assign_lhs (next_stmt);
2584               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2585               continue;
2586             }
2587
2588           return false;
2589         }
2590       else
2591         {
2592           tree op = gimple_assign_rhs2 (next_stmt);
2593           gimple *def_stmt = NULL;
2594
2595           if (TREE_CODE (op) == SSA_NAME)
2596             def_stmt = SSA_NAME_DEF_STMT (op);
2597
2598           /* Check that the other def is either defined in the loop
2599             ("vect_internal_def"), or it's an induction (defined by a
2600             loop-header phi-node).  */
2601           if (def_stmt
2602               && gimple_bb (def_stmt)
2603               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2604               && (is_gimple_assign (def_stmt)
2605                   || is_gimple_call (def_stmt)
2606                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2607                               == vect_induction_def
2608                   || (gimple_code (def_stmt) == GIMPLE_PHI
2609                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2610                                   == vect_internal_def
2611                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2612             {
2613               if (dump_enabled_p ())
2614                 {
2615                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2616                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2617                 }
2618
2619               swap_ssa_operands (next_stmt,
2620                                  gimple_assign_rhs1_ptr (next_stmt),
2621                                  gimple_assign_rhs2_ptr (next_stmt));
2622               update_stmt (next_stmt);
2623
2624               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2625                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2626             }
2627           else
2628             return false;
2629         }
2630
2631       lhs = gimple_assign_lhs (next_stmt);
2632       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2633     }
2634
2635   /* Save the chain for further analysis in SLP detection.  */
2636   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2637   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2638   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2639
2640   return true;
2641 }
2642
2643
2644 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2645    reduction operation CODE has a handled computation expression.  */
2646
2647 bool
2648 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2649                       enum tree_code code)
2650 {
2651   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2652   auto_bitmap visited;
2653   tree lookfor = PHI_RESULT (phi);
2654   ssa_op_iter curri;
2655   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2656   while (USE_FROM_PTR (curr) != loop_arg)
2657     curr = op_iter_next_use (&curri);
2658   curri.i = curri.numops;
2659   do
2660     {
2661       path.safe_push (std::make_pair (curri, curr));
2662       tree use = USE_FROM_PTR (curr);
2663       if (use == lookfor)
2664         break;
2665       gimple *def = SSA_NAME_DEF_STMT (use);
2666       if (gimple_nop_p (def)
2667           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2668         {
2669 pop:
2670           do
2671             {
2672               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2673               curri = x.first;
2674               curr = x.second;
2675               do
2676                 curr = op_iter_next_use (&curri);
2677               /* Skip already visited or non-SSA operands (from iterating
2678                  over PHI args).  */
2679               while (curr != NULL_USE_OPERAND_P
2680                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2681                          || ! bitmap_set_bit (visited,
2682                                               SSA_NAME_VERSION
2683                                                 (USE_FROM_PTR (curr)))));
2684             }
2685           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2686           if (curr == NULL_USE_OPERAND_P)
2687             break;
2688         }
2689       else
2690         {
2691           if (gimple_code (def) == GIMPLE_PHI)
2692             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2693           else
2694             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2695           while (curr != NULL_USE_OPERAND_P
2696                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2697                      || ! bitmap_set_bit (visited,
2698                                           SSA_NAME_VERSION
2699                                             (USE_FROM_PTR (curr)))))
2700             curr = op_iter_next_use (&curri);
2701           if (curr == NULL_USE_OPERAND_P)
2702             goto pop;
2703         }
2704     }
2705   while (1);
2706   if (dump_file && (dump_flags & TDF_DETAILS))
2707     {
2708       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2709       unsigned i;
2710       std::pair<ssa_op_iter, use_operand_p> *x;
2711       FOR_EACH_VEC_ELT (path, i, x)
2712         {
2713           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2714           dump_printf (MSG_NOTE, " ");
2715         }
2716       dump_printf (MSG_NOTE, "\n");
2717     }
2718
2719   /* Check whether the reduction path detected is valid.  */
2720   bool fail = path.length () == 0;
2721   bool neg = false;
2722   for (unsigned i = 1; i < path.length (); ++i)
2723     {
2724       gimple *use_stmt = USE_STMT (path[i].second);
2725       tree op = USE_FROM_PTR (path[i].second);
2726       if (! has_single_use (op)
2727           || ! is_gimple_assign (use_stmt))
2728         {
2729           fail = true;
2730           break;
2731         }
2732       if (gimple_assign_rhs_code (use_stmt) != code)
2733         {
2734           if (code == PLUS_EXPR
2735               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2736             {
2737               /* Track whether we negate the reduction value each iteration.  */
2738               if (gimple_assign_rhs2 (use_stmt) == op)
2739                 neg = ! neg;
2740             }
2741           else
2742             {
2743               fail = true;
2744               break;
2745             }
2746         }
2747     }
2748   return ! fail && ! neg;
2749 }
2750
2751
2752 /* Function vect_is_simple_reduction
2753
2754    (1) Detect a cross-iteration def-use cycle that represents a simple
2755    reduction computation.  We look for the following pattern:
2756
2757    loop_header:
2758      a1 = phi < a0, a2 >
2759      a3 = ...
2760      a2 = operation (a3, a1)
2761
2762    or
2763
2764    a3 = ...
2765    loop_header:
2766      a1 = phi < a0, a2 >
2767      a2 = operation (a3, a1)
2768
2769    such that:
2770    1. operation is commutative and associative and it is safe to
2771       change the order of the computation
2772    2. no uses for a2 in the loop (a2 is used out of the loop)
2773    3. no uses of a1 in the loop besides the reduction operation
2774    4. no uses of a1 outside the loop.
2775
2776    Conditions 1,4 are tested here.
2777    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2778
2779    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2780    nested cycles.
2781
2782    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2783    reductions:
2784
2785      a1 = phi < a0, a2 >
2786      inner loop (def of a3)
2787      a2 = phi < a3 >
2788
2789    (4) Detect condition expressions, ie:
2790      for (int i = 0; i < N; i++)
2791        if (a[i] < val)
2792         ret_val = a[i];
2793
2794 */
2795
2796 static gimple *
2797 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2798                           bool *double_reduc,
2799                           bool need_wrapping_integral_overflow,
2800                           enum vect_reduction_type *v_reduc_type)
2801 {
2802   struct loop *loop = (gimple_bb (phi))->loop_father;
2803   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2804   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2805   enum tree_code orig_code, code;
2806   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2807   tree type;
2808   int nloop_uses;
2809   tree name;
2810   imm_use_iterator imm_iter;
2811   use_operand_p use_p;
2812   bool phi_def;
2813
2814   *double_reduc = false;
2815   *v_reduc_type = TREE_CODE_REDUCTION;
2816
2817   tree phi_name = PHI_RESULT (phi);
2818   /* ???  If there are no uses of the PHI result the inner loop reduction
2819      won't be detected as possibly double-reduction by vectorizable_reduction
2820      because that tries to walk the PHI arg from the preheader edge which
2821      can be constant.  See PR60382.  */
2822   if (has_zero_uses (phi_name))
2823     return NULL;
2824   nloop_uses = 0;
2825   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2826     {
2827       gimple *use_stmt = USE_STMT (use_p);
2828       if (is_gimple_debug (use_stmt))
2829         continue;
2830
2831       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2832         {
2833           if (dump_enabled_p ())
2834             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2835                              "intermediate value used outside loop.\n");
2836
2837           return NULL;
2838         }
2839
2840       nloop_uses++;
2841       if (nloop_uses > 1)
2842         {
2843           if (dump_enabled_p ())
2844             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2845                              "reduction value used in loop.\n");
2846           return NULL;
2847         }
2848
2849       phi_use_stmt = use_stmt;
2850     }
2851
2852   edge latch_e = loop_latch_edge (loop);
2853   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2854   if (TREE_CODE (loop_arg) != SSA_NAME)
2855     {
2856       if (dump_enabled_p ())
2857         {
2858           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2859                            "reduction: not ssa_name: ");
2860           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2861           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2862         }
2863       return NULL;
2864     }
2865
2866   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2867   if (is_gimple_assign (def_stmt))
2868     {
2869       name = gimple_assign_lhs (def_stmt);
2870       phi_def = false;
2871     }
2872   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2873     {
2874       name = PHI_RESULT (def_stmt);
2875       phi_def = true;
2876     }
2877   else
2878     {
2879       if (dump_enabled_p ())
2880         {
2881           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2882                            "reduction: unhandled reduction operation: ");
2883           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2884         }
2885       return NULL;
2886     }
2887
2888   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2889     return NULL;
2890
2891   nloop_uses = 0;
2892   auto_vec<gphi *, 3> lcphis;
2893   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2894     {
2895       gimple *use_stmt = USE_STMT (use_p);
2896       if (is_gimple_debug (use_stmt))
2897         continue;
2898       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2899         nloop_uses++;
2900       else
2901         /* We can have more than one loop-closed PHI.  */
2902         lcphis.safe_push (as_a <gphi *> (use_stmt));
2903       if (nloop_uses > 1)
2904         {
2905           if (dump_enabled_p ())
2906             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2907                              "reduction used in loop.\n");
2908           return NULL;
2909         }
2910     }
2911
2912   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2913      defined in the inner loop.  */
2914   if (phi_def)
2915     {
2916       op1 = PHI_ARG_DEF (def_stmt, 0);
2917
2918       if (gimple_phi_num_args (def_stmt) != 1
2919           || TREE_CODE (op1) != SSA_NAME)
2920         {
2921           if (dump_enabled_p ())
2922             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2923                              "unsupported phi node definition.\n");
2924
2925           return NULL;
2926         }
2927
2928       def1 = SSA_NAME_DEF_STMT (op1);
2929       if (gimple_bb (def1)
2930           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2931           && loop->inner
2932           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2933           && is_gimple_assign (def1)
2934           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2935         {
2936           if (dump_enabled_p ())
2937             report_vect_op (MSG_NOTE, def_stmt,
2938                             "detected double reduction: ");
2939
2940           *double_reduc = true;
2941           return def_stmt;
2942         }
2943
2944       return NULL;
2945     }
2946
2947   /* If we are vectorizing an inner reduction we are executing that
2948      in the original order only in case we are not dealing with a
2949      double reduction.  */
2950   bool check_reduction = true;
2951   if (flow_loop_nested_p (vect_loop, loop))
2952     {
2953       gphi *lcphi;
2954       unsigned i;
2955       check_reduction = false;
2956       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2957         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2958           {
2959             gimple *use_stmt = USE_STMT (use_p);
2960             if (is_gimple_debug (use_stmt))
2961               continue;
2962             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2963               check_reduction = true;
2964           }
2965     }
2966
2967   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2968   code = orig_code = gimple_assign_rhs_code (def_stmt);
2969
2970   /* We can handle "res -= x[i]", which is non-associative by
2971      simply rewriting this into "res += -x[i]".  Avoid changing
2972      gimple instruction for the first simple tests and only do this
2973      if we're allowed to change code at all.  */
2974   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2975     code = PLUS_EXPR;
2976
2977   if (code == COND_EXPR)
2978     {
2979       if (! nested_in_vect_loop)
2980         *v_reduc_type = COND_REDUCTION;
2981
2982       op3 = gimple_assign_rhs1 (def_stmt);
2983       if (COMPARISON_CLASS_P (op3))
2984         {
2985           op4 = TREE_OPERAND (op3, 1);
2986           op3 = TREE_OPERAND (op3, 0);
2987         }
2988       if (op3 == phi_name || op4 == phi_name)
2989         {
2990           if (dump_enabled_p ())
2991             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2992                             "reduction: condition depends on previous"
2993                             " iteration: ");
2994           return NULL;
2995         }
2996
2997       op1 = gimple_assign_rhs2 (def_stmt);
2998       op2 = gimple_assign_rhs3 (def_stmt);
2999     }
3000   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3001     {
3002       if (dump_enabled_p ())
3003         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3004                         "reduction: not commutative/associative: ");
3005       return NULL;
3006     }
3007   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3008     {
3009       op1 = gimple_assign_rhs1 (def_stmt);
3010       op2 = gimple_assign_rhs2 (def_stmt);
3011     }
3012   else
3013     {
3014       if (dump_enabled_p ())
3015         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3016                         "reduction: not handled operation: ");
3017       return NULL;
3018     }
3019
3020   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3021     {
3022       if (dump_enabled_p ())
3023         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3024                         "reduction: both uses not ssa_names: ");
3025
3026       return NULL;
3027     }
3028
3029   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3030   if ((TREE_CODE (op1) == SSA_NAME
3031        && !types_compatible_p (type,TREE_TYPE (op1)))
3032       || (TREE_CODE (op2) == SSA_NAME
3033           && !types_compatible_p (type, TREE_TYPE (op2)))
3034       || (op3 && TREE_CODE (op3) == SSA_NAME
3035           && !types_compatible_p (type, TREE_TYPE (op3)))
3036       || (op4 && TREE_CODE (op4) == SSA_NAME
3037           && !types_compatible_p (type, TREE_TYPE (op4))))
3038     {
3039       if (dump_enabled_p ())
3040         {
3041           dump_printf_loc (MSG_NOTE, vect_location,
3042                            "reduction: multiple types: operation type: ");
3043           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3044           dump_printf (MSG_NOTE, ", operands types: ");
3045           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3046                              TREE_TYPE (op1));
3047           dump_printf (MSG_NOTE, ",");
3048           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3049                              TREE_TYPE (op2));
3050           if (op3)
3051             {
3052               dump_printf (MSG_NOTE, ",");
3053               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3054                                  TREE_TYPE (op3));
3055             }
3056
3057           if (op4)
3058             {
3059               dump_printf (MSG_NOTE, ",");
3060               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3061                                  TREE_TYPE (op4));
3062             }
3063           dump_printf (MSG_NOTE, "\n");
3064         }
3065
3066       return NULL;
3067     }
3068
3069   /* Check that it's ok to change the order of the computation.
3070      Generally, when vectorizing a reduction we change the order of the
3071      computation.  This may change the behavior of the program in some
3072      cases, so we need to check that this is ok.  One exception is when
3073      vectorizing an outer-loop: the inner-loop is executed sequentially,
3074      and therefore vectorizing reductions in the inner-loop during
3075      outer-loop vectorization is safe.  */
3076
3077   if (*v_reduc_type != COND_REDUCTION
3078       && check_reduction)
3079     {
3080       /* CHECKME: check for !flag_finite_math_only too?  */
3081       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
3082         {
3083           /* Changing the order of operations changes the semantics.  */
3084           if (dump_enabled_p ())
3085             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3086                         "reduction: unsafe fp math optimization: ");
3087           return NULL;
3088         }
3089       else if (INTEGRAL_TYPE_P (type))
3090         {
3091           if (!operation_no_trapping_overflow (type, code))
3092             {
3093               /* Changing the order of operations changes the semantics.  */
3094               if (dump_enabled_p ())
3095                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3096                                 "reduction: unsafe int math optimization"
3097                                 " (overflow traps): ");
3098               return NULL;
3099             }
3100           if (need_wrapping_integral_overflow
3101               && !TYPE_OVERFLOW_WRAPS (type)
3102               && operation_can_overflow (code))
3103             {
3104               /* Changing the order of operations changes the semantics.  */
3105               if (dump_enabled_p ())
3106                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3107                                 "reduction: unsafe int math optimization"
3108                                 " (overflow doesn't wrap): ");
3109               return NULL;
3110             }
3111         }
3112       else if (SAT_FIXED_POINT_TYPE_P (type))
3113         {
3114           /* Changing the order of operations changes the semantics.  */
3115           if (dump_enabled_p ())
3116           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3117                           "reduction: unsafe fixed-point math optimization: ");
3118           return NULL;
3119         }
3120     }
3121
3122   /* Reduction is safe. We're dealing with one of the following:
3123      1) integer arithmetic and no trapv
3124      2) floating point arithmetic, and special flags permit this optimization
3125      3) nested cycle (i.e., outer loop vectorization).  */
3126   if (TREE_CODE (op1) == SSA_NAME)
3127     def1 = SSA_NAME_DEF_STMT (op1);
3128
3129   if (TREE_CODE (op2) == SSA_NAME)
3130     def2 = SSA_NAME_DEF_STMT (op2);
3131
3132   if (code != COND_EXPR
3133       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3134     {
3135       if (dump_enabled_p ())
3136         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3137       return NULL;
3138     }
3139
3140   /* Check that one def is the reduction def, defined by PHI,
3141      the other def is either defined in the loop ("vect_internal_def"),
3142      or it's an induction (defined by a loop-header phi-node).  */
3143
3144   if (def2 && def2 == phi
3145       && (code == COND_EXPR
3146           || !def1 || gimple_nop_p (def1)
3147           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3148           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3149               && (is_gimple_assign (def1)
3150                   || is_gimple_call (def1)
3151                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3152                       == vect_induction_def
3153                   || (gimple_code (def1) == GIMPLE_PHI
3154                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3155                           == vect_internal_def
3156                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3157     {
3158       if (dump_enabled_p ())
3159         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3160       return def_stmt;
3161     }
3162
3163   if (def1 && def1 == phi
3164       && (code == COND_EXPR
3165           || !def2 || gimple_nop_p (def2)
3166           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3167           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3168               && (is_gimple_assign (def2)
3169                   || is_gimple_call (def2)
3170                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3171                        == vect_induction_def
3172                   || (gimple_code (def2) == GIMPLE_PHI
3173                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3174                            == vect_internal_def
3175                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3176     {
3177       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3178         {
3179           /* Check if we can swap operands (just for simplicity - so that
3180              the rest of the code can assume that the reduction variable
3181              is always the last (second) argument).  */
3182           if (code == COND_EXPR)
3183             {
3184               /* Swap cond_expr by inverting the condition.  */
3185               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3186               enum tree_code invert_code = ERROR_MARK;
3187               enum tree_code cond_code = TREE_CODE (cond_expr);
3188
3189               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3190                 {
3191                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3192                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3193                 }
3194               if (invert_code != ERROR_MARK)
3195                 {
3196                   TREE_SET_CODE (cond_expr, invert_code);
3197                   swap_ssa_operands (def_stmt,
3198                                      gimple_assign_rhs2_ptr (def_stmt),
3199                                      gimple_assign_rhs3_ptr (def_stmt));
3200                 }
3201               else
3202                 {
3203                   if (dump_enabled_p ())
3204                     report_vect_op (MSG_NOTE, def_stmt,
3205                                     "detected reduction: cannot swap operands "
3206                                     "for cond_expr");
3207                   return NULL;
3208                 }
3209             }
3210           else
3211             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3212                                gimple_assign_rhs2_ptr (def_stmt));
3213
3214           if (dump_enabled_p ())
3215             report_vect_op (MSG_NOTE, def_stmt,
3216                             "detected reduction: need to swap operands: ");
3217
3218           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3219             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3220         }
3221       else
3222         {
3223           if (dump_enabled_p ())
3224             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3225         }
3226
3227       return def_stmt;
3228     }
3229
3230   /* Try to find SLP reduction chain.  */
3231   if (! nested_in_vect_loop
3232       && code != COND_EXPR
3233       && orig_code != MINUS_EXPR
3234       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3235     {
3236       if (dump_enabled_p ())
3237         report_vect_op (MSG_NOTE, def_stmt,
3238                         "reduction: detected reduction chain: ");
3239
3240       return def_stmt;
3241     }
3242
3243   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3244   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3245   while (first)
3246     {
3247       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3248       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3249       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3250       first = next;
3251     }
3252
3253   /* Look for the expression computing loop_arg from loop PHI result.  */
3254   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3255                             code))
3256     return def_stmt;
3257
3258   if (dump_enabled_p ())
3259     {
3260       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3261                       "reduction: unknown pattern: ");
3262     }
3263
3264   return NULL;
3265 }
3266
3267 /* Wrapper around vect_is_simple_reduction, which will modify code
3268    in-place if it enables detection of more reductions.  Arguments
3269    as there.  */
3270
3271 gimple *
3272 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3273                              bool *double_reduc,
3274                              bool need_wrapping_integral_overflow)
3275 {
3276   enum vect_reduction_type v_reduc_type;
3277   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3278                                           need_wrapping_integral_overflow,
3279                                           &v_reduc_type);
3280   if (def)
3281     {
3282       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3283       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3284       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3285       reduc_def_info = vinfo_for_stmt (def);
3286       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3287     }
3288   return def;
3289 }
3290
3291 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3292 int
3293 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3294                              int *peel_iters_epilogue,
3295                              stmt_vector_for_cost *scalar_cost_vec,
3296                              stmt_vector_for_cost *prologue_cost_vec,
3297                              stmt_vector_for_cost *epilogue_cost_vec)
3298 {
3299   int retval = 0;
3300   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3301
3302   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3303     {
3304       *peel_iters_epilogue = assumed_vf / 2;
3305       if (dump_enabled_p ())
3306         dump_printf_loc (MSG_NOTE, vect_location,
3307                          "cost model: epilogue peel iters set to vf/2 "
3308                          "because loop iterations are unknown .\n");
3309
3310       /* If peeled iterations are known but number of scalar loop
3311          iterations are unknown, count a taken branch per peeled loop.  */
3312       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3313                                  NULL, 0, vect_prologue);
3314       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3315                                  NULL, 0, vect_epilogue);
3316     }
3317   else
3318     {
3319       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3320       peel_iters_prologue = niters < peel_iters_prologue ?
3321                             niters : peel_iters_prologue;
3322       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3323       /* If we need to peel for gaps, but no peeling is required, we have to
3324          peel VF iterations.  */
3325       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3326         *peel_iters_epilogue = assumed_vf;
3327     }
3328
3329   stmt_info_for_cost *si;
3330   int j;
3331   if (peel_iters_prologue)
3332     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3333         {
3334           stmt_vec_info stmt_info
3335             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3336           retval += record_stmt_cost (prologue_cost_vec,
3337                                       si->count * peel_iters_prologue,
3338                                       si->kind, stmt_info, si->misalign,
3339                                       vect_prologue);
3340         }
3341   if (*peel_iters_epilogue)
3342     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3343         {
3344           stmt_vec_info stmt_info
3345             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3346           retval += record_stmt_cost (epilogue_cost_vec,
3347                                       si->count * *peel_iters_epilogue,
3348                                       si->kind, stmt_info, si->misalign,
3349                                       vect_epilogue);
3350         }
3351
3352   return retval;
3353 }
3354
3355 /* Function vect_estimate_min_profitable_iters
3356
3357    Return the number of iterations required for the vector version of the
3358    loop to be profitable relative to the cost of the scalar version of the
3359    loop.
3360
3361    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3362    of iterations for vectorization.  -1 value means loop vectorization
3363    is not profitable.  This returned value may be used for dynamic
3364    profitability check.
3365
3366    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3367    for static check against estimated number of iterations.  */
3368
3369 static void
3370 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3371                                     int *ret_min_profitable_niters,
3372                                     int *ret_min_profitable_estimate)
3373 {
3374   int min_profitable_iters;
3375   int min_profitable_estimate;
3376   int peel_iters_prologue;
3377   int peel_iters_epilogue;
3378   unsigned vec_inside_cost = 0;
3379   int vec_outside_cost = 0;
3380   unsigned vec_prologue_cost = 0;
3381   unsigned vec_epilogue_cost = 0;
3382   int scalar_single_iter_cost = 0;
3383   int scalar_outside_cost = 0;
3384   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3385   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3386   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3387
3388   /* Cost model disabled.  */
3389   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3390     {
3391       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3392       *ret_min_profitable_niters = 0;
3393       *ret_min_profitable_estimate = 0;
3394       return;
3395     }
3396
3397   /* Requires loop versioning tests to handle misalignment.  */
3398   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3399     {
3400       /*  FIXME: Make cost depend on complexity of individual check.  */
3401       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3402       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3403                             vect_prologue);
3404       dump_printf (MSG_NOTE,
3405                    "cost model: Adding cost of checks for loop "
3406                    "versioning to treat misalignment.\n");
3407     }
3408
3409   /* Requires loop versioning with alias checks.  */
3410   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3411     {
3412       /*  FIXME: Make cost depend on complexity of individual check.  */
3413       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3414       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3415                             vect_prologue);
3416       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3417       if (len)
3418         /* Count LEN - 1 ANDs and LEN comparisons.  */
3419         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3420                               NULL, 0, vect_prologue);
3421       dump_printf (MSG_NOTE,
3422                    "cost model: Adding cost of checks for loop "
3423                    "versioning aliasing.\n");
3424     }
3425
3426   /* Requires loop versioning with niter checks.  */
3427   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3428     {
3429       /*  FIXME: Make cost depend on complexity of individual check.  */
3430       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3431                             vect_prologue);
3432       dump_printf (MSG_NOTE,
3433                    "cost model: Adding cost of checks for loop "
3434                    "versioning niters.\n");
3435     }
3436
3437   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3438     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3439                           vect_prologue);
3440
3441   /* Count statements in scalar loop.  Using this as scalar cost for a single
3442      iteration for now.
3443
3444      TODO: Add outer loop support.
3445
3446      TODO: Consider assigning different costs to different scalar
3447      statements.  */
3448
3449   scalar_single_iter_cost
3450     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3451
3452   /* Add additional cost for the peeled instructions in prologue and epilogue
3453      loop.
3454
3455      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3456      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3457
3458      TODO: Build an expression that represents peel_iters for prologue and
3459      epilogue to be used in a run-time test.  */
3460
3461   if (npeel  < 0)
3462     {
3463       peel_iters_prologue = assumed_vf / 2;
3464       dump_printf (MSG_NOTE, "cost model: "
3465                    "prologue peel iters set to vf/2.\n");
3466
3467       /* If peeling for alignment is unknown, loop bound of main loop becomes
3468          unknown.  */
3469       peel_iters_epilogue = assumed_vf / 2;
3470       dump_printf (MSG_NOTE, "cost model: "
3471                    "epilogue peel iters set to vf/2 because "
3472                    "peeling for alignment is unknown.\n");
3473
3474       /* If peeled iterations are unknown, count a taken branch and a not taken
3475          branch per peeled loop. Even if scalar loop iterations are known,
3476          vector iterations are not known since peeled prologue iterations are
3477          not known. Hence guards remain the same.  */
3478       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3479                             NULL, 0, vect_prologue);
3480       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3481                             NULL, 0, vect_prologue);
3482       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3483                             NULL, 0, vect_epilogue);
3484       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3485                             NULL, 0, vect_epilogue);
3486       stmt_info_for_cost *si;
3487       int j;
3488       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3489         {
3490           struct _stmt_vec_info *stmt_info
3491             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3492           (void) add_stmt_cost (target_cost_data,
3493                                 si->count * peel_iters_prologue,
3494                                 si->kind, stmt_info, si->misalign,
3495                                 vect_prologue);
3496           (void) add_stmt_cost (target_cost_data,
3497                                 si->count * peel_iters_epilogue,
3498                                 si->kind, stmt_info, si->misalign,
3499                                 vect_epilogue);
3500         }
3501     }
3502   else
3503     {
3504       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3505       stmt_info_for_cost *si;
3506       int j;
3507       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3508
3509       prologue_cost_vec.create (2);
3510       epilogue_cost_vec.create (2);
3511       peel_iters_prologue = npeel;
3512
3513       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3514                                           &peel_iters_epilogue,
3515                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3516                                             (loop_vinfo),
3517                                           &prologue_cost_vec,
3518                                           &epilogue_cost_vec);
3519
3520       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3521         {
3522           struct _stmt_vec_info *stmt_info
3523             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3524           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3525                                 si->misalign, vect_prologue);
3526         }
3527
3528       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3529         {
3530           struct _stmt_vec_info *stmt_info
3531             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3532           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3533                                 si->misalign, vect_epilogue);
3534         }
3535
3536       prologue_cost_vec.release ();
3537       epilogue_cost_vec.release ();
3538     }
3539
3540   /* FORNOW: The scalar outside cost is incremented in one of the
3541      following ways:
3542
3543      1. The vectorizer checks for alignment and aliasing and generates
3544      a condition that allows dynamic vectorization.  A cost model
3545      check is ANDED with the versioning condition.  Hence scalar code
3546      path now has the added cost of the versioning check.
3547
3548        if (cost > th & versioning_check)
3549          jmp to vector code
3550
3551      Hence run-time scalar is incremented by not-taken branch cost.
3552
3553      2. The vectorizer then checks if a prologue is required.  If the
3554      cost model check was not done before during versioning, it has to
3555      be done before the prologue check.
3556
3557        if (cost <= th)
3558          prologue = scalar_iters
3559        if (prologue == 0)
3560          jmp to vector code
3561        else
3562          execute prologue
3563        if (prologue == num_iters)
3564          go to exit
3565
3566      Hence the run-time scalar cost is incremented by a taken branch,
3567      plus a not-taken branch, plus a taken branch cost.
3568
3569      3. The vectorizer then checks if an epilogue is required.  If the
3570      cost model check was not done before during prologue check, it
3571      has to be done with the epilogue check.
3572
3573        if (prologue == 0)
3574          jmp to vector code
3575        else
3576          execute prologue
3577        if (prologue == num_iters)
3578          go to exit
3579        vector code:
3580          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3581            jmp to epilogue
3582
3583      Hence the run-time scalar cost should be incremented by 2 taken
3584      branches.
3585
3586      TODO: The back end may reorder the BBS's differently and reverse
3587      conditions/branch directions.  Change the estimates below to
3588      something more reasonable.  */
3589
3590   /* If the number of iterations is known and we do not do versioning, we can
3591      decide whether to vectorize at compile time.  Hence the scalar version
3592      do not carry cost model guard costs.  */
3593   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3594       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3595     {
3596       /* Cost model check occurs at versioning.  */
3597       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3598         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3599       else
3600         {
3601           /* Cost model check occurs at prologue generation.  */
3602           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3603             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3604               + vect_get_stmt_cost (cond_branch_not_taken);
3605           /* Cost model check occurs at epilogue generation.  */
3606           else
3607             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3608         }
3609     }
3610
3611   /* Complete the target-specific cost calculations.  */
3612   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3613                &vec_inside_cost, &vec_epilogue_cost);
3614
3615   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3616
3617   if (dump_enabled_p ())
3618     {
3619       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3620       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3621                    vec_inside_cost);
3622       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3623                    vec_prologue_cost);
3624       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3625                    vec_epilogue_cost);
3626       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3627                    scalar_single_iter_cost);
3628       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3629                    scalar_outside_cost);
3630       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3631                    vec_outside_cost);
3632       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3633                    peel_iters_prologue);
3634       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3635                    peel_iters_epilogue);
3636     }
3637
3638   /* Calculate number of iterations required to make the vector version
3639      profitable, relative to the loop bodies only.  The following condition
3640      must hold true:
3641      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3642      where
3643      SIC = scalar iteration cost, VIC = vector iteration cost,
3644      VOC = vector outside cost, VF = vectorization factor,
3645      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3646      SOC = scalar outside cost for run time cost model check.  */
3647
3648   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3649     {
3650       if (vec_outside_cost <= 0)
3651         min_profitable_iters = 0;
3652       else
3653         {
3654           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3655                                   * assumed_vf
3656                                   - vec_inside_cost * peel_iters_prologue
3657                                   - vec_inside_cost * peel_iters_epilogue)
3658                                  / ((scalar_single_iter_cost * assumed_vf)
3659                                     - vec_inside_cost);
3660
3661           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3662               <= (((int) vec_inside_cost * min_profitable_iters)
3663                   + (((int) vec_outside_cost - scalar_outside_cost)
3664                      * assumed_vf)))
3665             min_profitable_iters++;
3666         }
3667     }
3668   /* vector version will never be profitable.  */
3669   else
3670     {
3671       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3672         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3673                     "did not happen for a simd loop");
3674
3675       if (dump_enabled_p ())
3676         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3677                          "cost model: the vector iteration cost = %d "
3678                          "divided by the scalar iteration cost = %d "
3679                          "is greater or equal to the vectorization factor = %d"
3680                          ".\n",
3681                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3682       *ret_min_profitable_niters = -1;
3683       *ret_min_profitable_estimate = -1;
3684       return;
3685     }
3686
3687   dump_printf (MSG_NOTE,
3688                "  Calculated minimum iters for profitability: %d\n",
3689                min_profitable_iters);
3690
3691   /* We want the vectorized loop to execute at least once.  */
3692   if (min_profitable_iters < (assumed_vf + peel_iters_prologue))
3693     min_profitable_iters = assumed_vf + peel_iters_prologue;
3694
3695   if (dump_enabled_p ())
3696     dump_printf_loc (MSG_NOTE, vect_location,
3697                      "  Runtime profitability threshold = %d\n",
3698                      min_profitable_iters);
3699
3700   *ret_min_profitable_niters = min_profitable_iters;
3701
3702   /* Calculate number of iterations required to make the vector version
3703      profitable, relative to the loop bodies only.
3704
3705      Non-vectorized variant is SIC * niters and it must win over vector
3706      variant on the expected loop trip count.  The following condition must hold true:
3707      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3708
3709   if (vec_outside_cost <= 0)
3710     min_profitable_estimate = 0;
3711   else
3712     {
3713       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3714                                  * assumed_vf
3715                                  - vec_inside_cost * peel_iters_prologue
3716                                  - vec_inside_cost * peel_iters_epilogue)
3717                                  / ((scalar_single_iter_cost * assumed_vf)
3718                                    - vec_inside_cost);
3719     }
3720   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3721   if (dump_enabled_p ())
3722     dump_printf_loc (MSG_NOTE, vect_location,
3723                      "  Static estimate profitability threshold = %d\n",
3724                      min_profitable_estimate);
3725
3726   *ret_min_profitable_estimate = min_profitable_estimate;
3727 }
3728
3729 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3730    vector elements (not bits) for a vector with NELT elements.  */
3731 static void
3732 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3733                               vec_perm_builder *sel)
3734 {
3735   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3736      by vec_perm_indices.  */
3737   sel->new_vector (nelt, 1, 3);
3738   for (unsigned int i = 0; i < 3; i++)
3739     sel->quick_push (i + offset);
3740 }
3741
3742 /* Checks whether the target supports whole-vector shifts for vectors of mode
3743    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3744    it supports vec_perm_const with masks for all necessary shift amounts.  */
3745 static bool
3746 have_whole_vector_shift (machine_mode mode)
3747 {
3748   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3749     return true;
3750
3751   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3752   vec_perm_builder sel;
3753   vec_perm_indices indices;
3754   for (i = nelt/2; i >= 1; i/=2)
3755     {
3756       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3757       indices.new_vector (sel, 2, nelt);
3758       if (!can_vec_perm_const_p (mode, indices, false))
3759         return false;
3760     }
3761   return true;
3762 }
3763
3764 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3765    functions. Design better to avoid maintenance issues.  */
3766
3767 /* Function vect_model_reduction_cost.
3768
3769    Models cost for a reduction operation, including the vector ops
3770    generated within the strip-mine loop, the initial definition before
3771    the loop, and the epilogue code that must be generated.  */
3772
3773 static void
3774 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3775                            int ncopies)
3776 {
3777   int prologue_cost = 0, epilogue_cost = 0;
3778   enum tree_code code;
3779   optab optab;
3780   tree vectype;
3781   gimple *orig_stmt;
3782   machine_mode mode;
3783   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3784   struct loop *loop = NULL;
3785   void *target_cost_data;
3786
3787   if (loop_vinfo)
3788     {
3789       loop = LOOP_VINFO_LOOP (loop_vinfo);
3790       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3791     }
3792   else
3793     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3794
3795   /* Condition reductions generate two reductions in the loop.  */
3796   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3797     ncopies *= 2;
3798
3799   /* Cost of reduction op inside loop.  */
3800   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3801                                         stmt_info, 0, vect_body);
3802
3803   vectype = STMT_VINFO_VECTYPE (stmt_info);
3804   mode = TYPE_MODE (vectype);
3805   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3806
3807   if (!orig_stmt)
3808     orig_stmt = STMT_VINFO_STMT (stmt_info);
3809
3810   code = gimple_assign_rhs_code (orig_stmt);
3811
3812   /* Add in cost for initial definition.
3813      For cond reduction we have four vectors: initial index, step, initial
3814      result of the data reduction, initial value of the index reduction.  */
3815   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3816                        == COND_REDUCTION ? 4 : 1;
3817   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3818                                   scalar_to_vec, stmt_info, 0,
3819                                   vect_prologue);
3820
3821   /* Determine cost of epilogue code.
3822
3823      We have a reduction operator that will reduce the vector in one statement.
3824      Also requires scalar extract.  */
3825
3826   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3827     {
3828       if (reduc_fn != IFN_LAST)
3829         {
3830           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3831             {
3832               /* An EQ stmt and an COND_EXPR stmt.  */
3833               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3834                                               vector_stmt, stmt_info, 0,
3835                                               vect_epilogue);
3836               /* Reduction of the max index and a reduction of the found
3837                  values.  */
3838               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3839                                               vec_to_scalar, stmt_info, 0,
3840                                               vect_epilogue);
3841               /* A broadcast of the max value.  */
3842               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3843                                               scalar_to_vec, stmt_info, 0,
3844                                               vect_epilogue);
3845             }
3846           else
3847             {
3848               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3849                                               stmt_info, 0, vect_epilogue);
3850               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3851                                               vec_to_scalar, stmt_info, 0,
3852                                               vect_epilogue);
3853             }
3854         }
3855       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3856         {
3857           unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3858           /* Extraction of scalar elements.  */
3859           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3860                                           vec_to_scalar, stmt_info, 0,
3861                                           vect_epilogue);
3862           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3863           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3864                                           scalar_stmt, stmt_info, 0,
3865                                           vect_epilogue);
3866         }
3867       else
3868         {
3869           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3870           tree bitsize =
3871             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3872           int element_bitsize = tree_to_uhwi (bitsize);
3873           int nelements = vec_size_in_bits / element_bitsize;
3874
3875           if (code == COND_EXPR)
3876             code = MAX_EXPR;
3877
3878           optab = optab_for_tree_code (code, vectype, optab_default);
3879
3880           /* We have a whole vector shift available.  */
3881           if (optab != unknown_optab
3882               && VECTOR_MODE_P (mode)
3883               && optab_handler (optab, mode) != CODE_FOR_nothing
3884               && have_whole_vector_shift (mode))
3885             {
3886               /* Final reduction via vector shifts and the reduction operator.
3887                  Also requires scalar extract.  */
3888               epilogue_cost += add_stmt_cost (target_cost_data,
3889                                               exact_log2 (nelements) * 2,
3890                                               vector_stmt, stmt_info, 0,
3891                                               vect_epilogue);
3892               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3893                                               vec_to_scalar, stmt_info, 0,
3894                                               vect_epilogue);
3895             }
3896           else
3897             /* Use extracts and reduction op for final reduction.  For N
3898                elements, we have N extracts and N-1 reduction ops.  */
3899             epilogue_cost += add_stmt_cost (target_cost_data,
3900                                             nelements + nelements - 1,
3901                                             vector_stmt, stmt_info, 0,
3902                                             vect_epilogue);
3903         }
3904     }
3905
3906   if (dump_enabled_p ())
3907     dump_printf (MSG_NOTE,
3908                  "vect_model_reduction_cost: inside_cost = %d, "
3909                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3910                  prologue_cost, epilogue_cost);
3911 }
3912
3913
3914 /* Function vect_model_induction_cost.
3915
3916    Models cost for induction operations.  */
3917
3918 static void
3919 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3920 {
3921   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3922   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3923   unsigned inside_cost, prologue_cost;
3924
3925   if (PURE_SLP_STMT (stmt_info))
3926     return;
3927
3928   /* loop cost for vec_loop.  */
3929   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3930                                stmt_info, 0, vect_body);
3931
3932   /* prologue cost for vec_init and vec_step.  */
3933   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3934                                  stmt_info, 0, vect_prologue);
3935
3936   if (dump_enabled_p ())
3937     dump_printf_loc (MSG_NOTE, vect_location,
3938                      "vect_model_induction_cost: inside_cost = %d, "
3939                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3940 }
3941
3942
3943
3944 /* Function get_initial_def_for_reduction
3945
3946    Input:
3947    STMT - a stmt that performs a reduction operation in the loop.
3948    INIT_VAL - the initial value of the reduction variable
3949
3950    Output:
3951    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3952         of the reduction (used for adjusting the epilog - see below).
3953    Return a vector variable, initialized according to the operation that STMT
3954         performs. This vector will be used as the initial value of the
3955         vector of partial results.
3956
3957    Option1 (adjust in epilog): Initialize the vector as follows:
3958      add/bit or/xor:    [0,0,...,0,0]
3959      mult/bit and:      [1,1,...,1,1]
3960      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3961    and when necessary (e.g. add/mult case) let the caller know
3962    that it needs to adjust the result by init_val.
3963
3964    Option2: Initialize the vector as follows:
3965      add/bit or/xor:    [init_val,0,0,...,0]
3966      mult/bit and:      [init_val,1,1,...,1]
3967      min/max/cond_expr: [init_val,init_val,...,init_val]
3968    and no adjustments are needed.
3969
3970    For example, for the following code:
3971
3972    s = init_val;
3973    for (i=0;i<n;i++)
3974      s = s + a[i];
3975
3976    STMT is 's = s + a[i]', and the reduction variable is 's'.
3977    For a vector of 4 units, we want to return either [0,0,0,init_val],
3978    or [0,0,0,0] and let the caller know that it needs to adjust
3979    the result at the end by 'init_val'.
3980
3981    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3982    initialization vector is simpler (same element in all entries), if
3983    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3984
3985    A cost model should help decide between these two schemes.  */
3986
3987 tree
3988 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3989                                tree *adjustment_def)
3990 {
3991   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3992   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3993   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3994   tree scalar_type = TREE_TYPE (init_val);
3995   tree vectype = get_vectype_for_scalar_type (scalar_type);
3996   enum tree_code code = gimple_assign_rhs_code (stmt);
3997   tree def_for_init;
3998   tree init_def;
3999   bool nested_in_vect_loop = false;
4000   REAL_VALUE_TYPE real_init_val = dconst0;
4001   int int_init_val = 0;
4002   gimple *def_stmt = NULL;
4003   gimple_seq stmts = NULL;
4004
4005   gcc_assert (vectype);
4006
4007   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4008               || SCALAR_FLOAT_TYPE_P (scalar_type));
4009
4010   if (nested_in_vect_loop_p (loop, stmt))
4011     nested_in_vect_loop = true;
4012   else
4013     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4014
4015   /* In case of double reduction we only create a vector variable to be put
4016      in the reduction phi node.  The actual statement creation is done in
4017      vect_create_epilog_for_reduction.  */
4018   if (adjustment_def && nested_in_vect_loop
4019       && TREE_CODE (init_val) == SSA_NAME
4020       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4021       && gimple_code (def_stmt) == GIMPLE_PHI
4022       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4023       && vinfo_for_stmt (def_stmt)
4024       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4025           == vect_double_reduction_def)
4026     {
4027       *adjustment_def = NULL;
4028       return vect_create_destination_var (init_val, vectype);
4029     }
4030
4031   /* In case of a nested reduction do not use an adjustment def as
4032      that case is not supported by the epilogue generation correctly
4033      if ncopies is not one.  */
4034   if (adjustment_def && nested_in_vect_loop)
4035     {
4036       *adjustment_def = NULL;
4037       return vect_get_vec_def_for_operand (init_val, stmt);
4038     }
4039
4040   switch (code)
4041     {
4042     case WIDEN_SUM_EXPR:
4043     case DOT_PROD_EXPR:
4044     case SAD_EXPR:
4045     case PLUS_EXPR:
4046     case MINUS_EXPR:
4047     case BIT_IOR_EXPR:
4048     case BIT_XOR_EXPR:
4049     case MULT_EXPR:
4050     case BIT_AND_EXPR:
4051       {
4052         /* ADJUSTMENT_DEF is NULL when called from
4053            vect_create_epilog_for_reduction to vectorize double reduction.  */
4054         if (adjustment_def)
4055           *adjustment_def = init_val;
4056
4057         if (code == MULT_EXPR)
4058           {
4059             real_init_val = dconst1;
4060             int_init_val = 1;
4061           }
4062
4063         if (code == BIT_AND_EXPR)
4064           int_init_val = -1;
4065
4066         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4067           def_for_init = build_real (scalar_type, real_init_val);
4068         else
4069           def_for_init = build_int_cst (scalar_type, int_init_val);
4070
4071         if (adjustment_def)
4072           /* Option1: the first element is '0' or '1' as well.  */
4073           init_def = gimple_build_vector_from_val (&stmts, vectype,
4074                                                    def_for_init);
4075         else
4076           {
4077             /* Option2: the first element is INIT_VAL.  */
4078             tree_vector_builder elts (vectype, 1, 2);
4079             elts.quick_push (init_val);
4080             elts.quick_push (def_for_init);
4081             init_def = gimple_build_vector (&stmts, &elts);
4082           }
4083       }
4084       break;
4085
4086     case MIN_EXPR:
4087     case MAX_EXPR:
4088     case COND_EXPR:
4089       {
4090         if (adjustment_def)
4091           {
4092             *adjustment_def = NULL_TREE;
4093             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4094               {
4095                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4096                 break;
4097               }
4098           }
4099         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4100         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4101       }
4102       break;
4103
4104     default:
4105       gcc_unreachable ();
4106     }
4107
4108   if (stmts)
4109     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4110   return init_def;
4111 }
4112
4113 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4114    NUMBER_OF_VECTORS is the number of vector defs to create.  */
4115
4116 static void
4117 get_initial_defs_for_reduction (slp_tree slp_node,
4118                                 vec<tree> *vec_oprnds,
4119                                 unsigned int number_of_vectors,
4120                                 enum tree_code code, bool reduc_chain)
4121 {
4122   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4123   gimple *stmt = stmts[0];
4124   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4125   unsigned nunits;
4126   unsigned j, number_of_places_left_in_vector;
4127   tree vector_type, scalar_type;
4128   tree vop;
4129   int group_size = stmts.length ();
4130   unsigned int vec_num, i;
4131   unsigned number_of_copies = 1;
4132   vec<tree> voprnds;
4133   voprnds.create (number_of_vectors);
4134   tree neutral_op = NULL;
4135   struct loop *loop;
4136
4137   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4138   scalar_type = TREE_TYPE (vector_type);
4139   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4140
4141   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4142
4143   loop = (gimple_bb (stmt))->loop_father;
4144   gcc_assert (loop);
4145   edge pe = loop_preheader_edge (loop);
4146
4147   /* op is the reduction operand of the first stmt already.  */
4148   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4149      we need either neutral operands or the original operands.  See
4150      get_initial_def_for_reduction() for details.  */
4151   switch (code)
4152     {
4153     case WIDEN_SUM_EXPR:
4154     case DOT_PROD_EXPR:
4155     case SAD_EXPR:
4156     case PLUS_EXPR:
4157     case MINUS_EXPR:
4158     case BIT_IOR_EXPR:
4159     case BIT_XOR_EXPR:
4160       neutral_op = build_zero_cst (scalar_type);
4161       break;
4162
4163     case MULT_EXPR:
4164       neutral_op = build_one_cst (scalar_type);
4165       break;
4166
4167     case BIT_AND_EXPR:
4168       neutral_op = build_all_ones_cst (scalar_type);
4169       break;
4170
4171     /* For MIN/MAX we don't have an easy neutral operand but
4172        the initial values can be used fine here.  Only for
4173        a reduction chain we have to force a neutral element.  */
4174     case MAX_EXPR:
4175     case MIN_EXPR:
4176       if (! reduc_chain)
4177         neutral_op = NULL;
4178       else
4179         neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4180       break;
4181
4182     default:
4183       gcc_assert (! reduc_chain);
4184       neutral_op = NULL;
4185     }
4186
4187   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4188      created vectors. It is greater than 1 if unrolling is performed.
4189
4190      For example, we have two scalar operands, s1 and s2 (e.g., group of
4191      strided accesses of size two), while NUNITS is four (i.e., four scalars
4192      of this type can be packed in a vector).  The output vector will contain
4193      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4194      will be 2).
4195
4196      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4197      containing the operands.
4198
4199      For example, NUNITS is four as before, and the group size is 8
4200      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4201      {s5, s6, s7, s8}.  */
4202
4203   number_of_copies = nunits * number_of_vectors / group_size;
4204
4205   number_of_places_left_in_vector = nunits;
4206   tree_vector_builder elts (vector_type, nunits, 1);
4207   elts.quick_grow (nunits);
4208   for (j = 0; j < number_of_copies; j++)
4209     {
4210       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4211         {
4212           tree op;
4213           /* Get the def before the loop.  In reduction chain we have only
4214              one initial value.  */
4215           if ((j != (number_of_copies - 1)
4216                || (reduc_chain && i != 0))
4217               && neutral_op)
4218             op = neutral_op;
4219           else
4220             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4221
4222           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4223           number_of_places_left_in_vector--;
4224           elts[number_of_places_left_in_vector] = op;
4225
4226           if (number_of_places_left_in_vector == 0)
4227             {
4228               gimple_seq ctor_seq = NULL;
4229               tree init = gimple_build_vector (&ctor_seq, &elts);
4230               if (ctor_seq != NULL)
4231                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4232               voprnds.quick_push (init);
4233
4234               number_of_places_left_in_vector = nunits;
4235               elts.new_vector (vector_type, nunits, 1);
4236               elts.quick_grow (nunits);
4237             }
4238         }
4239     }
4240
4241   /* Since the vectors are created in the reverse order, we should invert
4242      them.  */
4243   vec_num = voprnds.length ();
4244   for (j = vec_num; j != 0; j--)
4245     {
4246       vop = voprnds[j - 1];
4247       vec_oprnds->quick_push (vop);
4248     }
4249
4250   voprnds.release ();
4251
4252   /* In case that VF is greater than the unrolling factor needed for the SLP
4253      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4254      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4255      to replicate the vectors.  */
4256   tree neutral_vec = NULL;
4257   while (number_of_vectors > vec_oprnds->length ())
4258     {
4259       if (neutral_op)
4260         {
4261           if (!neutral_vec)
4262             {
4263               gimple_seq ctor_seq = NULL;
4264               neutral_vec = gimple_build_vector_from_val
4265                 (&ctor_seq, vector_type, neutral_op);
4266               if (ctor_seq != NULL)
4267                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4268             }
4269           vec_oprnds->quick_push (neutral_vec);
4270         }
4271       else
4272         {
4273           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4274             vec_oprnds->quick_push (vop);
4275         }
4276     }
4277 }
4278
4279
4280 /* Function vect_create_epilog_for_reduction
4281
4282    Create code at the loop-epilog to finalize the result of a reduction
4283    computation.
4284
4285    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4286      reduction statements.
4287    STMT is the scalar reduction stmt that is being vectorized.
4288    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4289      number of elements that we can fit in a vectype (nunits).  In this case
4290      we have to generate more than one vector stmt - i.e - we need to "unroll"
4291      the vector stmt by a factor VF/nunits.  For more details see documentation
4292      in vectorizable_operation.
4293    REDUC_FN is the internal function for the epilog reduction.
4294    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4295      computation.
4296    REDUC_INDEX is the index of the operand in the right hand side of the
4297      statement that is defined by REDUCTION_PHI.
4298    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4299    SLP_NODE is an SLP node containing a group of reduction statements. The
4300      first one in this group is STMT.
4301    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4302      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4303      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4304      any value of the IV in the loop.
4305    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4306
4307    This function:
4308    1. Creates the reduction def-use cycles: sets the arguments for
4309       REDUCTION_PHIS:
4310       The loop-entry argument is the vectorized initial-value of the reduction.
4311       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4312       sums.
4313    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4314       by calling the function specified by REDUC_FN if available, or by
4315       other means (whole-vector shifts or a scalar loop).
4316       The function also creates a new phi node at the loop exit to preserve
4317       loop-closed form, as illustrated below.
4318
4319      The flow at the entry to this function:
4320
4321         loop:
4322           vec_def = phi <null, null>            # REDUCTION_PHI
4323           VECT_DEF = vector_stmt                # vectorized form of STMT
4324           s_loop = scalar_stmt                  # (scalar) STMT
4325         loop_exit:
4326           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4327           use <s_out0>
4328           use <s_out0>
4329
4330      The above is transformed by this function into:
4331
4332         loop:
4333           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4334           VECT_DEF = vector_stmt                # vectorized form of STMT
4335           s_loop = scalar_stmt                  # (scalar) STMT
4336         loop_exit:
4337           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4338           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4339           v_out2 = reduce <v_out1>
4340           s_out3 = extract_field <v_out2, 0>
4341           s_out4 = adjust_result <s_out3>
4342           use <s_out4>
4343           use <s_out4>
4344 */
4345
4346 static void
4347 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4348                                   gimple *reduc_def_stmt,
4349                                   int ncopies, internal_fn reduc_fn,
4350                                   vec<gimple *> reduction_phis,
4351                                   bool double_reduc,
4352                                   slp_tree slp_node,
4353                                   slp_instance slp_node_instance,
4354                                   tree induc_val, enum tree_code induc_code)
4355 {
4356   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4357   stmt_vec_info prev_phi_info;
4358   tree vectype;
4359   machine_mode mode;
4360   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4361   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4362   basic_block exit_bb;
4363   tree scalar_dest;
4364   tree scalar_type;
4365   gimple *new_phi = NULL, *phi;
4366   gimple_stmt_iterator exit_gsi;
4367   tree vec_dest;
4368   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4369   gimple *epilog_stmt = NULL;
4370   enum tree_code code = gimple_assign_rhs_code (stmt);
4371   gimple *exit_phi;
4372   tree bitsize;
4373   tree adjustment_def = NULL;
4374   tree vec_initial_def = NULL;
4375   tree expr, def, initial_def = NULL;
4376   tree orig_name, scalar_result;
4377   imm_use_iterator imm_iter, phi_imm_iter;
4378   use_operand_p use_p, phi_use_p;
4379   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4380   bool nested_in_vect_loop = false;
4381   auto_vec<gimple *> new_phis;
4382   auto_vec<gimple *> inner_phis;
4383   enum vect_def_type dt = vect_unknown_def_type;
4384   int j, i;
4385   auto_vec<tree> scalar_results;
4386   unsigned int group_size = 1, k, ratio;
4387   auto_vec<tree> vec_initial_defs;
4388   auto_vec<gimple *> phis;
4389   bool slp_reduc = false;
4390   tree new_phi_result;
4391   gimple *inner_phi = NULL;
4392   tree induction_index = NULL_TREE;
4393
4394   if (slp_node)
4395     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4396
4397   if (nested_in_vect_loop_p (loop, stmt))
4398     {
4399       outer_loop = loop;
4400       loop = loop->inner;
4401       nested_in_vect_loop = true;
4402       gcc_assert (!slp_node);
4403     }
4404
4405   vectype = STMT_VINFO_VECTYPE (stmt_info);
4406   gcc_assert (vectype);
4407   mode = TYPE_MODE (vectype);
4408
4409   /* 1. Create the reduction def-use cycle:
4410      Set the arguments of REDUCTION_PHIS, i.e., transform
4411
4412         loop:
4413           vec_def = phi <null, null>            # REDUCTION_PHI
4414           VECT_DEF = vector_stmt                # vectorized form of STMT
4415           ...
4416
4417      into:
4418
4419         loop:
4420           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4421           VECT_DEF = vector_stmt                # vectorized form of STMT
4422           ...
4423
4424      (in case of SLP, do it for all the phis). */
4425
4426   /* Get the loop-entry arguments.  */
4427   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4428   if (slp_node)
4429     {
4430       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4431       vec_initial_defs.reserve (vec_num);
4432       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4433                                       &vec_initial_defs, vec_num, code,
4434                                       GROUP_FIRST_ELEMENT (stmt_info));
4435     }
4436   else
4437     {
4438       /* Get at the scalar def before the loop, that defines the initial value
4439          of the reduction variable.  */
4440       gimple *def_stmt;
4441       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4442                                            loop_preheader_edge (loop));
4443       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4444          and we can't use zero for induc_val, use initial_def.  Similarly
4445          for REDUC_MIN and initial_def larger than the base.  */
4446       if (TREE_CODE (initial_def) == INTEGER_CST
4447           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4448               == INTEGER_INDUC_COND_REDUCTION)
4449           && !integer_zerop (induc_val)
4450           && ((induc_code == MAX_EXPR
4451                && tree_int_cst_lt (initial_def, induc_val))
4452               || (induc_code == MIN_EXPR
4453                   && tree_int_cst_lt (induc_val, initial_def))))
4454         induc_val = initial_def;
4455       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4456       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4457                                                        &adjustment_def);
4458       vec_initial_defs.create (1);
4459       vec_initial_defs.quick_push (vec_initial_def);
4460     }
4461
4462   /* Set phi nodes arguments.  */
4463   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4464     {
4465       tree vec_init_def = vec_initial_defs[i];
4466       tree def = vect_defs[i];
4467       for (j = 0; j < ncopies; j++)
4468         {
4469           if (j != 0)
4470             {
4471               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4472               if (nested_in_vect_loop)
4473                 vec_init_def
4474                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4475                                                     vec_init_def);
4476             }
4477
4478           /* Set the loop-entry arg of the reduction-phi.  */
4479
4480           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4481               == INTEGER_INDUC_COND_REDUCTION)
4482             {
4483               /* Initialise the reduction phi to zero.  This prevents initial
4484                  values of non-zero interferring with the reduction op.  */
4485               gcc_assert (ncopies == 1);
4486               gcc_assert (i == 0);
4487
4488               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4489               tree induc_val_vec
4490                 = build_vector_from_val (vec_init_def_type, induc_val);
4491
4492               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4493                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4494             }
4495           else
4496             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4497                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4498
4499           /* Set the loop-latch arg for the reduction-phi.  */
4500           if (j > 0)
4501             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4502
4503           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4504                        UNKNOWN_LOCATION);
4505
4506           if (dump_enabled_p ())
4507             {
4508               dump_printf_loc (MSG_NOTE, vect_location,
4509                                "transform reduction: created def-use cycle: ");
4510               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4511               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4512             }
4513         }
4514     }
4515
4516   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4517      which is updated with the current index of the loop for every match of
4518      the original loop's cond_expr (VEC_STMT).  This results in a vector
4519      containing the last time the condition passed for that vector lane.
4520      The first match will be a 1 to allow 0 to be used for non-matching
4521      indexes.  If there are no matches at all then the vector will be all
4522      zeroes.  */
4523   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4524     {
4525       tree indx_before_incr, indx_after_incr;
4526       int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4527       int k;
4528
4529       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4530       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4531
4532       int scalar_precision
4533         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4534       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4535       tree cr_index_vector_type = build_vector_type
4536         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4537
4538       /* First we create a simple vector induction variable which starts
4539          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4540          vector size (STEP).  */
4541
4542       /* Create a {1,2,3,...} vector.  */
4543       tree_vector_builder vtemp (cr_index_vector_type, 1, 3);
4544       for (k = 0; k < 3; ++k)
4545         vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
4546       tree series_vect = vtemp.build ();
4547
4548       /* Create a vector of the step value.  */
4549       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4550       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4551
4552       /* Create an induction variable.  */
4553       gimple_stmt_iterator incr_gsi;
4554       bool insert_after;
4555       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4556       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4557                  insert_after, &indx_before_incr, &indx_after_incr);
4558
4559       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4560          filled with zeros (VEC_ZERO).  */
4561
4562       /* Create a vector of 0s.  */
4563       tree zero = build_zero_cst (cr_index_scalar_type);
4564       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4565
4566       /* Create a vector phi node.  */
4567       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4568       new_phi = create_phi_node (new_phi_tree, loop->header);
4569       set_vinfo_for_stmt (new_phi,
4570                           new_stmt_vec_info (new_phi, loop_vinfo));
4571       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4572                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4573
4574       /* Now take the condition from the loops original cond_expr
4575          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4576          every match uses values from the induction variable
4577          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4578          (NEW_PHI_TREE).
4579          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4580          the new cond_expr (INDEX_COND_EXPR).  */
4581
4582       /* Duplicate the condition from vec_stmt.  */
4583       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4584
4585       /* Create a conditional, where the condition is taken from vec_stmt
4586          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4587          else is the phi (NEW_PHI_TREE).  */
4588       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4589                                      ccompare, indx_before_incr,
4590                                      new_phi_tree);
4591       induction_index = make_ssa_name (cr_index_vector_type);
4592       gimple *index_condition = gimple_build_assign (induction_index,
4593                                                      index_cond_expr);
4594       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4595       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4596                                                         loop_vinfo);
4597       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4598       set_vinfo_for_stmt (index_condition, index_vec_info);
4599
4600       /* Update the phi with the vec cond.  */
4601       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4602                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4603     }
4604
4605   /* 2. Create epilog code.
4606         The reduction epilog code operates across the elements of the vector
4607         of partial results computed by the vectorized loop.
4608         The reduction epilog code consists of:
4609
4610         step 1: compute the scalar result in a vector (v_out2)
4611         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4612         step 3: adjust the scalar result (s_out3) if needed.
4613
4614         Step 1 can be accomplished using one the following three schemes:
4615           (scheme 1) using reduc_fn, if available.
4616           (scheme 2) using whole-vector shifts, if available.
4617           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4618                      combined.
4619
4620           The overall epilog code looks like this:
4621
4622           s_out0 = phi <s_loop>         # original EXIT_PHI
4623           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4624           v_out2 = reduce <v_out1>              # step 1
4625           s_out3 = extract_field <v_out2, 0>    # step 2
4626           s_out4 = adjust_result <s_out3>       # step 3
4627
4628           (step 3 is optional, and steps 1 and 2 may be combined).
4629           Lastly, the uses of s_out0 are replaced by s_out4.  */
4630
4631
4632   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4633          v_out1 = phi <VECT_DEF>
4634          Store them in NEW_PHIS.  */
4635
4636   exit_bb = single_exit (loop)->dest;
4637   prev_phi_info = NULL;
4638   new_phis.create (vect_defs.length ());
4639   FOR_EACH_VEC_ELT (vect_defs, i, def)
4640     {
4641       for (j = 0; j < ncopies; j++)
4642         {
4643           tree new_def = copy_ssa_name (def);
4644           phi = create_phi_node (new_def, exit_bb);
4645           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4646           if (j == 0)
4647             new_phis.quick_push (phi);
4648           else
4649             {
4650               def = vect_get_vec_def_for_stmt_copy (dt, def);
4651               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4652             }
4653
4654           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4655           prev_phi_info = vinfo_for_stmt (phi);
4656         }
4657     }
4658
4659   /* The epilogue is created for the outer-loop, i.e., for the loop being
4660      vectorized.  Create exit phis for the outer loop.  */
4661   if (double_reduc)
4662     {
4663       loop = outer_loop;
4664       exit_bb = single_exit (loop)->dest;
4665       inner_phis.create (vect_defs.length ());
4666       FOR_EACH_VEC_ELT (new_phis, i, phi)
4667         {
4668           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4669           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4670           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4671                            PHI_RESULT (phi));
4672           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4673                                                             loop_vinfo));
4674           inner_phis.quick_push (phi);
4675           new_phis[i] = outer_phi;
4676           prev_phi_info = vinfo_for_stmt (outer_phi);
4677           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4678             {
4679               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4680               new_result = copy_ssa_name (PHI_RESULT (phi));
4681               outer_phi = create_phi_node (new_result, exit_bb);
4682               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4683                                PHI_RESULT (phi));
4684               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4685                                                                 loop_vinfo));
4686               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4687               prev_phi_info = vinfo_for_stmt (outer_phi);
4688             }
4689         }
4690     }
4691
4692   exit_gsi = gsi_after_labels (exit_bb);
4693
4694   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4695          (i.e. when reduc_fn is not available) and in the final adjustment
4696          code (if needed).  Also get the original scalar reduction variable as
4697          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4698          represents a reduction pattern), the tree-code and scalar-def are
4699          taken from the original stmt that the pattern-stmt (STMT) replaces.
4700          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4701          are taken from STMT.  */
4702
4703   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4704   if (!orig_stmt)
4705     {
4706       /* Regular reduction  */
4707       orig_stmt = stmt;
4708     }
4709   else
4710     {
4711       /* Reduction pattern  */
4712       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4713       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4714       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4715     }
4716
4717   code = gimple_assign_rhs_code (orig_stmt);
4718   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4719      partial results are added and not subtracted.  */
4720   if (code == MINUS_EXPR)
4721     code = PLUS_EXPR;
4722
4723   scalar_dest = gimple_assign_lhs (orig_stmt);
4724   scalar_type = TREE_TYPE (scalar_dest);
4725   scalar_results.create (group_size);
4726   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4727   bitsize = TYPE_SIZE (scalar_type);
4728
4729   /* In case this is a reduction in an inner-loop while vectorizing an outer
4730      loop - we don't need to extract a single scalar result at the end of the
4731      inner-loop (unless it is double reduction, i.e., the use of reduction is
4732      outside the outer-loop).  The final vector of partial results will be used
4733      in the vectorized outer-loop, or reduced to a scalar result at the end of
4734      the outer-loop.  */
4735   if (nested_in_vect_loop && !double_reduc)
4736     goto vect_finalize_reduction;
4737
4738   /* SLP reduction without reduction chain, e.g.,
4739      # a1 = phi <a2, a0>
4740      # b1 = phi <b2, b0>
4741      a2 = operation (a1)
4742      b2 = operation (b1)  */
4743   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4744
4745   /* In case of reduction chain, e.g.,
4746      # a1 = phi <a3, a0>
4747      a2 = operation (a1)
4748      a3 = operation (a2),
4749
4750      we may end up with more than one vector result.  Here we reduce them to
4751      one vector.  */
4752   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4753     {
4754       tree first_vect = PHI_RESULT (new_phis[0]);
4755       gassign *new_vec_stmt = NULL;
4756       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4757       for (k = 1; k < new_phis.length (); k++)
4758         {
4759           gimple *next_phi = new_phis[k];
4760           tree second_vect = PHI_RESULT (next_phi);
4761           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4762           new_vec_stmt = gimple_build_assign (tem, code,
4763                                               first_vect, second_vect);
4764           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4765           first_vect = tem;
4766         }
4767
4768       new_phi_result = first_vect;
4769       if (new_vec_stmt)
4770         {
4771           new_phis.truncate (0);
4772           new_phis.safe_push (new_vec_stmt);
4773         }
4774     }
4775   /* Likewise if we couldn't use a single defuse cycle.  */
4776   else if (ncopies > 1)
4777     {
4778       gcc_assert (new_phis.length () == 1);
4779       tree first_vect = PHI_RESULT (new_phis[0]);
4780       gassign *new_vec_stmt = NULL;
4781       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4782       gimple *next_phi = new_phis[0];
4783       for (int k = 1; k < ncopies; ++k)
4784         {
4785           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4786           tree second_vect = PHI_RESULT (next_phi);
4787           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4788           new_vec_stmt = gimple_build_assign (tem, code,
4789                                               first_vect, second_vect);
4790           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4791           first_vect = tem;
4792         }
4793       new_phi_result = first_vect;
4794       new_phis.truncate (0);
4795       new_phis.safe_push (new_vec_stmt);
4796     }
4797   else
4798     new_phi_result = PHI_RESULT (new_phis[0]);
4799
4800   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4801       && reduc_fn != IFN_LAST)
4802     {
4803       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4804          various data values where the condition matched and another vector
4805          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4806          need to extract the last matching index (which will be the index with
4807          highest value) and use this to index into the data vector.
4808          For the case where there were no matches, the data vector will contain
4809          all default values and the index vector will be all zeros.  */
4810
4811       /* Get various versions of the type of the vector of indexes.  */
4812       tree index_vec_type = TREE_TYPE (induction_index);
4813       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4814       tree index_scalar_type = TREE_TYPE (index_vec_type);
4815       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4816         (index_vec_type);
4817
4818       /* Get an unsigned integer version of the type of the data vector.  */
4819       int scalar_precision
4820         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4821       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4822       tree vectype_unsigned = build_vector_type
4823         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4824
4825       /* First we need to create a vector (ZERO_VEC) of zeros and another
4826          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4827          can create using a MAX reduction and then expanding.
4828          In the case where the loop never made any matches, the max index will
4829          be zero.  */
4830
4831       /* Vector of {0, 0, 0,...}.  */
4832       tree zero_vec = make_ssa_name (vectype);
4833       tree zero_vec_rhs = build_zero_cst (vectype);
4834       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4835       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4836
4837       /* Find maximum value from the vector of found indexes.  */
4838       tree max_index = make_ssa_name (index_scalar_type);
4839       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4840                                                           1, induction_index);
4841       gimple_call_set_lhs (max_index_stmt, max_index);
4842       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4843
4844       /* Vector of {max_index, max_index, max_index,...}.  */
4845       tree max_index_vec = make_ssa_name (index_vec_type);
4846       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4847                                                       max_index);
4848       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4849                                                         max_index_vec_rhs);
4850       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4851
4852       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4853          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4854          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4855          otherwise.  Only one value should match, resulting in a vector
4856          (VEC_COND) with one data value and the rest zeros.
4857          In the case where the loop never made any matches, every index will
4858          match, resulting in a vector with all data values (which will all be
4859          the default value).  */
4860
4861       /* Compare the max index vector to the vector of found indexes to find
4862          the position of the max value.  */
4863       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4864       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4865                                                       induction_index,
4866                                                       max_index_vec);
4867       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4868
4869       /* Use the compare to choose either values from the data vector or
4870          zero.  */
4871       tree vec_cond = make_ssa_name (vectype);
4872       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4873                                                    vec_compare, new_phi_result,
4874                                                    zero_vec);
4875       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4876
4877       /* Finally we need to extract the data value from the vector (VEC_COND)
4878          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4879          reduction, but because this doesn't exist, we can use a MAX reduction
4880          instead.  The data value might be signed or a float so we need to cast
4881          it first.
4882          In the case where the loop never made any matches, the data values are
4883          all identical, and so will reduce down correctly.  */
4884
4885       /* Make the matched data values unsigned.  */
4886       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4887       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4888                                        vec_cond);
4889       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4890                                                         VIEW_CONVERT_EXPR,
4891                                                         vec_cond_cast_rhs);
4892       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4893
4894       /* Reduce down to a scalar value.  */
4895       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4896       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4897                                                            1, vec_cond_cast);
4898       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4899       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4900
4901       /* Convert the reduced value back to the result type and set as the
4902          result.  */
4903       gimple_seq stmts = NULL;
4904       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4905                                data_reduc);
4906       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4907       scalar_results.safe_push (new_temp);
4908     }
4909   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4910            && reduc_fn == IFN_LAST)
4911     {
4912       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4913          idx = 0;
4914          idx_val = induction_index[0];
4915          val = data_reduc[0];
4916          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4917            if (induction_index[i] > idx_val)
4918              val = data_reduc[i], idx_val = induction_index[i];
4919          return val;  */
4920
4921       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4922       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4923       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4924       unsigned HOST_WIDE_INT v_size
4925         = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4926       tree idx_val = NULL_TREE, val = NULL_TREE;
4927       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4928         {
4929           tree old_idx_val = idx_val;
4930           tree old_val = val;
4931           idx_val = make_ssa_name (idx_eltype);
4932           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4933                                              build3 (BIT_FIELD_REF, idx_eltype,
4934                                                      induction_index,
4935                                                      bitsize_int (el_size),
4936                                                      bitsize_int (off)));
4937           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4938           val = make_ssa_name (data_eltype);
4939           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4940                                              build3 (BIT_FIELD_REF,
4941                                                      data_eltype,
4942                                                      new_phi_result,
4943                                                      bitsize_int (el_size),
4944                                                      bitsize_int (off)));
4945           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4946           if (off != 0)
4947             {
4948               tree new_idx_val = idx_val;
4949               tree new_val = val;
4950               if (off != v_size - el_size)
4951                 {
4952                   new_idx_val = make_ssa_name (idx_eltype);
4953                   epilog_stmt = gimple_build_assign (new_idx_val,
4954                                                      MAX_EXPR, idx_val,
4955                                                      old_idx_val);
4956                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4957                 }
4958               new_val = make_ssa_name (data_eltype);
4959               epilog_stmt = gimple_build_assign (new_val,
4960                                                  COND_EXPR,
4961                                                  build2 (GT_EXPR,
4962                                                          boolean_type_node,
4963                                                          idx_val,
4964                                                          old_idx_val),
4965                                                  val, old_val);
4966               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4967               idx_val = new_idx_val;
4968               val = new_val;
4969             }
4970         }
4971       /* Convert the reduced value back to the result type and set as the
4972          result.  */
4973       gimple_seq stmts = NULL;
4974       val = gimple_convert (&stmts, scalar_type, val);
4975       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4976       scalar_results.safe_push (val);
4977     }
4978
4979   /* 2.3 Create the reduction code, using one of the three schemes described
4980          above. In SLP we simply need to extract all the elements from the
4981          vector (without reducing them), so we use scalar shifts.  */
4982   else if (reduc_fn != IFN_LAST && !slp_reduc)
4983     {
4984       tree tmp;
4985       tree vec_elem_type;
4986
4987       /* Case 1:  Create:
4988          v_out2 = reduc_expr <v_out1>  */
4989
4990       if (dump_enabled_p ())
4991         dump_printf_loc (MSG_NOTE, vect_location,
4992                          "Reduce using direct vector reduction.\n");
4993
4994       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4995       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4996         {
4997           tree tmp_dest
4998             = vect_create_destination_var (scalar_dest, vec_elem_type);
4999           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5000                                                     new_phi_result);
5001           gimple_set_lhs (epilog_stmt, tmp_dest);
5002           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5003           gimple_set_lhs (epilog_stmt, new_temp);
5004           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5005
5006           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5007                                              new_temp);
5008         }
5009       else
5010         {
5011           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5012                                                     new_phi_result);
5013           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5014         }
5015
5016       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5017       gimple_set_lhs (epilog_stmt, new_temp);
5018       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5019
5020       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5021            == INTEGER_INDUC_COND_REDUCTION)
5022           && !operand_equal_p (initial_def, induc_val, 0))
5023         {
5024           /* Earlier we set the initial value to be a vector if induc_val
5025              values.  Check the result and if it is induc_val then replace
5026              with the original initial value, unless induc_val is
5027              the same as initial_def already.  */
5028           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5029                                   induc_val);
5030
5031           tmp = make_ssa_name (new_scalar_dest);
5032           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5033                                              initial_def, new_temp);
5034           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5035           new_temp = tmp;
5036         }
5037
5038       scalar_results.safe_push (new_temp);
5039     }
5040   else
5041     {
5042       bool reduce_with_shift = have_whole_vector_shift (mode);
5043       int element_bitsize = tree_to_uhwi (bitsize);
5044       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5045       tree vec_temp;
5046
5047       /* COND reductions all do the final reduction with MAX_EXPR
5048          or MIN_EXPR.  */
5049       if (code == COND_EXPR)
5050         {
5051           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5052               == INTEGER_INDUC_COND_REDUCTION)
5053             code = induc_code;
5054           else
5055             code = MAX_EXPR;
5056         }
5057
5058       /* Regardless of whether we have a whole vector shift, if we're
5059          emulating the operation via tree-vect-generic, we don't want
5060          to use it.  Only the first round of the reduction is likely
5061          to still be profitable via emulation.  */
5062       /* ??? It might be better to emit a reduction tree code here, so that
5063          tree-vect-generic can expand the first round via bit tricks.  */
5064       if (!VECTOR_MODE_P (mode))
5065         reduce_with_shift = false;
5066       else
5067         {
5068           optab optab = optab_for_tree_code (code, vectype, optab_default);
5069           if (optab_handler (optab, mode) == CODE_FOR_nothing)
5070             reduce_with_shift = false;
5071         }
5072
5073       if (reduce_with_shift && !slp_reduc)
5074         {
5075           int nelements = vec_size_in_bits / element_bitsize;
5076           vec_perm_builder sel;
5077           vec_perm_indices indices;
5078
5079           int elt_offset;
5080
5081           tree zero_vec = build_zero_cst (vectype);
5082           /* Case 2: Create:
5083              for (offset = nelements/2; offset >= 1; offset/=2)
5084                 {
5085                   Create:  va' = vec_shift <va, offset>
5086                   Create:  va = vop <va, va'>
5087                 }  */
5088
5089           tree rhs;
5090
5091           if (dump_enabled_p ())
5092             dump_printf_loc (MSG_NOTE, vect_location,
5093                              "Reduce using vector shifts\n");
5094
5095           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5096           new_temp = new_phi_result;
5097           for (elt_offset = nelements / 2;
5098                elt_offset >= 1;
5099                elt_offset /= 2)
5100             {
5101               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5102               indices.new_vector (sel, 2, nelements);
5103               tree mask = vect_gen_perm_mask_any (vectype, indices);
5104               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5105                                                  new_temp, zero_vec, mask);
5106               new_name = make_ssa_name (vec_dest, epilog_stmt);
5107               gimple_assign_set_lhs (epilog_stmt, new_name);
5108               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5109
5110               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5111                                                  new_temp);
5112               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5113               gimple_assign_set_lhs (epilog_stmt, new_temp);
5114               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5115             }
5116
5117           /* 2.4  Extract the final scalar result.  Create:
5118              s_out3 = extract_field <v_out2, bitpos>  */
5119
5120           if (dump_enabled_p ())
5121             dump_printf_loc (MSG_NOTE, vect_location,
5122                              "extract scalar result\n");
5123
5124           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5125                         bitsize, bitsize_zero_node);
5126           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5127           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5128           gimple_assign_set_lhs (epilog_stmt, new_temp);
5129           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5130           scalar_results.safe_push (new_temp);
5131         }
5132       else
5133         {
5134           /* Case 3: Create:
5135              s = extract_field <v_out2, 0>
5136              for (offset = element_size;
5137                   offset < vector_size;
5138                   offset += element_size;)
5139                {
5140                  Create:  s' = extract_field <v_out2, offset>
5141                  Create:  s = op <s, s'>  // For non SLP cases
5142                }  */
5143
5144           if (dump_enabled_p ())
5145             dump_printf_loc (MSG_NOTE, vect_location,
5146                              "Reduce using scalar code.\n");
5147
5148           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5149           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5150             {
5151               int bit_offset;
5152               if (gimple_code (new_phi) == GIMPLE_PHI)
5153                 vec_temp = PHI_RESULT (new_phi);
5154               else
5155                 vec_temp = gimple_assign_lhs (new_phi);
5156               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5157                                  bitsize_zero_node);
5158               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5159               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5160               gimple_assign_set_lhs (epilog_stmt, new_temp);
5161               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5162
5163               /* In SLP we don't need to apply reduction operation, so we just
5164                  collect s' values in SCALAR_RESULTS.  */
5165               if (slp_reduc)
5166                 scalar_results.safe_push (new_temp);
5167
5168               for (bit_offset = element_bitsize;
5169                    bit_offset < vec_size_in_bits;
5170                    bit_offset += element_bitsize)
5171                 {
5172                   tree bitpos = bitsize_int (bit_offset);
5173                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5174                                      bitsize, bitpos);
5175
5176                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5177                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5178                   gimple_assign_set_lhs (epilog_stmt, new_name);
5179                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5180
5181                   if (slp_reduc)
5182                     {
5183                       /* In SLP we don't need to apply reduction operation, so
5184                          we just collect s' values in SCALAR_RESULTS.  */
5185                       new_temp = new_name;
5186                       scalar_results.safe_push (new_name);
5187                     }
5188                   else
5189                     {
5190                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5191                                                          new_name, new_temp);
5192                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5193                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5194                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5195                     }
5196                 }
5197             }
5198
5199           /* The only case where we need to reduce scalar results in SLP, is
5200              unrolling.  If the size of SCALAR_RESULTS is greater than
5201              GROUP_SIZE, we reduce them combining elements modulo
5202              GROUP_SIZE.  */
5203           if (slp_reduc)
5204             {
5205               tree res, first_res, new_res;
5206               gimple *new_stmt;
5207
5208               /* Reduce multiple scalar results in case of SLP unrolling.  */
5209               for (j = group_size; scalar_results.iterate (j, &res);
5210                    j++)
5211                 {
5212                   first_res = scalar_results[j % group_size];
5213                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5214                                                   first_res, res);
5215                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5216                   gimple_assign_set_lhs (new_stmt, new_res);
5217                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5218                   scalar_results[j % group_size] = new_res;
5219                 }
5220             }
5221           else
5222             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5223             scalar_results.safe_push (new_temp);
5224         }
5225
5226       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5227            == INTEGER_INDUC_COND_REDUCTION)
5228           && !operand_equal_p (initial_def, induc_val, 0))
5229         {
5230           /* Earlier we set the initial value to be a vector if induc_val
5231              values.  Check the result and if it is induc_val then replace
5232              with the original initial value, unless induc_val is
5233              the same as initial_def already.  */
5234           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5235                                   induc_val);
5236
5237           tree tmp = make_ssa_name (new_scalar_dest);
5238           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5239                                              initial_def, new_temp);
5240           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5241           scalar_results[0] = tmp;
5242         }
5243     }
5244
5245 vect_finalize_reduction:
5246
5247   if (double_reduc)
5248     loop = loop->inner;
5249
5250   /* 2.5 Adjust the final result by the initial value of the reduction
5251          variable. (When such adjustment is not needed, then
5252          'adjustment_def' is zero).  For example, if code is PLUS we create:
5253          new_temp = loop_exit_def + adjustment_def  */
5254
5255   if (adjustment_def)
5256     {
5257       gcc_assert (!slp_reduc);
5258       if (nested_in_vect_loop)
5259         {
5260           new_phi = new_phis[0];
5261           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5262           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5263           new_dest = vect_create_destination_var (scalar_dest, vectype);
5264         }
5265       else
5266         {
5267           new_temp = scalar_results[0];
5268           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5269           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5270           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5271         }
5272
5273       epilog_stmt = gimple_build_assign (new_dest, expr);
5274       new_temp = make_ssa_name (new_dest, epilog_stmt);
5275       gimple_assign_set_lhs (epilog_stmt, new_temp);
5276       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5277       if (nested_in_vect_loop)
5278         {
5279           set_vinfo_for_stmt (epilog_stmt,
5280                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5281           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5282                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5283
5284           if (!double_reduc)
5285             scalar_results.quick_push (new_temp);
5286           else
5287             scalar_results[0] = new_temp;
5288         }
5289       else
5290         scalar_results[0] = new_temp;
5291
5292       new_phis[0] = epilog_stmt;
5293     }
5294
5295   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5296           phis with new adjusted scalar results, i.e., replace use <s_out0>
5297           with use <s_out4>.
5298
5299      Transform:
5300         loop_exit:
5301           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5302           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5303           v_out2 = reduce <v_out1>
5304           s_out3 = extract_field <v_out2, 0>
5305           s_out4 = adjust_result <s_out3>
5306           use <s_out0>
5307           use <s_out0>
5308
5309      into:
5310
5311         loop_exit:
5312           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5313           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5314           v_out2 = reduce <v_out1>
5315           s_out3 = extract_field <v_out2, 0>
5316           s_out4 = adjust_result <s_out3>
5317           use <s_out4>
5318           use <s_out4> */
5319
5320
5321   /* In SLP reduction chain we reduce vector results into one vector if
5322      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5323      the last stmt in the reduction chain, since we are looking for the loop
5324      exit phi node.  */
5325   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5326     {
5327       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5328       /* Handle reduction patterns.  */
5329       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5330         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5331
5332       scalar_dest = gimple_assign_lhs (dest_stmt);
5333       group_size = 1;
5334     }
5335
5336   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5337      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5338      need to match SCALAR_RESULTS with corresponding statements.  The first
5339      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5340      the first vector stmt, etc.
5341      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5342   if (group_size > new_phis.length ())
5343     {
5344       ratio = group_size / new_phis.length ();
5345       gcc_assert (!(group_size % new_phis.length ()));
5346     }
5347   else
5348     ratio = 1;
5349
5350   for (k = 0; k < group_size; k++)
5351     {
5352       if (k % ratio == 0)
5353         {
5354           epilog_stmt = new_phis[k / ratio];
5355           reduction_phi = reduction_phis[k / ratio];
5356           if (double_reduc)
5357             inner_phi = inner_phis[k / ratio];
5358         }
5359
5360       if (slp_reduc)
5361         {
5362           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5363
5364           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5365           /* SLP statements can't participate in patterns.  */
5366           gcc_assert (!orig_stmt);
5367           scalar_dest = gimple_assign_lhs (current_stmt);
5368         }
5369
5370       phis.create (3);
5371       /* Find the loop-closed-use at the loop exit of the original scalar
5372          result.  (The reduction result is expected to have two immediate uses -
5373          one at the latch block, and one at the loop exit).  */
5374       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5375         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5376             && !is_gimple_debug (USE_STMT (use_p)))
5377           phis.safe_push (USE_STMT (use_p));
5378
5379       /* While we expect to have found an exit_phi because of loop-closed-ssa
5380          form we can end up without one if the scalar cycle is dead.  */
5381
5382       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5383         {
5384           if (outer_loop)
5385             {
5386               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5387               gphi *vect_phi;
5388
5389               /* FORNOW. Currently not supporting the case that an inner-loop
5390                  reduction is not used in the outer-loop (but only outside the
5391                  outer-loop), unless it is double reduction.  */
5392               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5393                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5394                           || double_reduc);
5395
5396               if (double_reduc)
5397                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5398               else
5399                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5400               if (!double_reduc
5401                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5402                       != vect_double_reduction_def)
5403                 continue;
5404
5405               /* Handle double reduction:
5406
5407                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5408                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5409                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5410                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5411
5412                  At that point the regular reduction (stmt2 and stmt3) is
5413                  already vectorized, as well as the exit phi node, stmt4.
5414                  Here we vectorize the phi node of double reduction, stmt1, and
5415                  update all relevant statements.  */
5416
5417               /* Go through all the uses of s2 to find double reduction phi
5418                  node, i.e., stmt1 above.  */
5419               orig_name = PHI_RESULT (exit_phi);
5420               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5421                 {
5422                   stmt_vec_info use_stmt_vinfo;
5423                   stmt_vec_info new_phi_vinfo;
5424                   tree vect_phi_init, preheader_arg, vect_phi_res;
5425                   basic_block bb = gimple_bb (use_stmt);
5426                   gimple *use;
5427
5428                   /* Check that USE_STMT is really double reduction phi
5429                      node.  */
5430                   if (gimple_code (use_stmt) != GIMPLE_PHI
5431                       || gimple_phi_num_args (use_stmt) != 2
5432                       || bb->loop_father != outer_loop)
5433                     continue;
5434                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5435                   if (!use_stmt_vinfo
5436                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5437                           != vect_double_reduction_def)
5438                     continue;
5439
5440                   /* Create vector phi node for double reduction:
5441                      vs1 = phi <vs0, vs2>
5442                      vs1 was created previously in this function by a call to
5443                        vect_get_vec_def_for_operand and is stored in
5444                        vec_initial_def;
5445                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5446                      vs0 is created here.  */
5447
5448                   /* Create vector phi node.  */
5449                   vect_phi = create_phi_node (vec_initial_def, bb);
5450                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5451                                     loop_vec_info_for_loop (outer_loop));
5452                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5453
5454                   /* Create vs0 - initial def of the double reduction phi.  */
5455                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5456                                              loop_preheader_edge (outer_loop));
5457                   vect_phi_init = get_initial_def_for_reduction
5458                     (stmt, preheader_arg, NULL);
5459
5460                   /* Update phi node arguments with vs0 and vs2.  */
5461                   add_phi_arg (vect_phi, vect_phi_init,
5462                                loop_preheader_edge (outer_loop),
5463                                UNKNOWN_LOCATION);
5464                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5465                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5466                   if (dump_enabled_p ())
5467                     {
5468                       dump_printf_loc (MSG_NOTE, vect_location,
5469                                        "created double reduction phi node: ");
5470                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5471                     }
5472
5473                   vect_phi_res = PHI_RESULT (vect_phi);
5474
5475                   /* Replace the use, i.e., set the correct vs1 in the regular
5476                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5477                      loop is redundant.  */
5478                   use = reduction_phi;
5479                   for (j = 0; j < ncopies; j++)
5480                     {
5481                       edge pr_edge = loop_preheader_edge (loop);
5482                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5483                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5484                     }
5485                 }
5486             }
5487         }
5488
5489       phis.release ();
5490       if (nested_in_vect_loop)
5491         {
5492           if (double_reduc)
5493             loop = outer_loop;
5494           else
5495             continue;
5496         }
5497
5498       phis.create (3);
5499       /* Find the loop-closed-use at the loop exit of the original scalar
5500          result.  (The reduction result is expected to have two immediate uses,
5501          one at the latch block, and one at the loop exit).  For double
5502          reductions we are looking for exit phis of the outer loop.  */
5503       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5504         {
5505           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5506             {
5507               if (!is_gimple_debug (USE_STMT (use_p)))
5508                 phis.safe_push (USE_STMT (use_p));
5509             }
5510           else
5511             {
5512               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5513                 {
5514                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5515
5516                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5517                     {
5518                       if (!flow_bb_inside_loop_p (loop,
5519                                              gimple_bb (USE_STMT (phi_use_p)))
5520                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5521                         phis.safe_push (USE_STMT (phi_use_p));
5522                     }
5523                 }
5524             }
5525         }
5526
5527       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5528         {
5529           /* Replace the uses:  */
5530           orig_name = PHI_RESULT (exit_phi);
5531           scalar_result = scalar_results[k];
5532           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5533             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5534               SET_USE (use_p, scalar_result);
5535         }
5536
5537       phis.release ();
5538     }
5539 }
5540
5541
5542 /* Function is_nonwrapping_integer_induction.
5543
5544    Check if STMT (which is part of loop LOOP) both increments and
5545    does not cause overflow.  */
5546
5547 static bool
5548 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5549 {
5550   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5551   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5552   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5553   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5554   widest_int ni, max_loop_value, lhs_max;
5555   bool overflow = false;
5556
5557   /* Make sure the loop is integer based.  */
5558   if (TREE_CODE (base) != INTEGER_CST
5559       || TREE_CODE (step) != INTEGER_CST)
5560     return false;
5561
5562   /* Check that the max size of the loop will not wrap.  */
5563
5564   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5565     return true;
5566
5567   if (! max_stmt_executions (loop, &ni))
5568     return false;
5569
5570   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5571                             &overflow);
5572   if (overflow)
5573     return false;
5574
5575   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5576                             TYPE_SIGN (lhs_type), &overflow);
5577   if (overflow)
5578     return false;
5579
5580   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5581           <= TYPE_PRECISION (lhs_type));
5582 }
5583
5584 /* Function vectorizable_reduction.
5585
5586    Check if STMT performs a reduction operation that can be vectorized.
5587    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5588    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5589    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5590
5591    This function also handles reduction idioms (patterns) that have been
5592    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5593    of this form:
5594      X = pattern_expr (arg0, arg1, ..., X)
5595    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5596    sequence that had been detected and replaced by the pattern-stmt (STMT).
5597
5598    This function also handles reduction of condition expressions, for example:
5599      for (int i = 0; i < N; i++)
5600        if (a[i] < value)
5601          last = a[i];
5602    This is handled by vectorising the loop and creating an additional vector
5603    containing the loop indexes for which "a[i] < value" was true.  In the
5604    function epilogue this is reduced to a single max value and then used to
5605    index into the vector of results.
5606
5607    In some cases of reduction patterns, the type of the reduction variable X is
5608    different than the type of the other arguments of STMT.
5609    In such cases, the vectype that is used when transforming STMT into a vector
5610    stmt is different than the vectype that is used to determine the
5611    vectorization factor, because it consists of a different number of elements
5612    than the actual number of elements that are being operated upon in parallel.
5613
5614    For example, consider an accumulation of shorts into an int accumulator.
5615    On some targets it's possible to vectorize this pattern operating on 8
5616    shorts at a time (hence, the vectype for purposes of determining the
5617    vectorization factor should be V8HI); on the other hand, the vectype that
5618    is used to create the vector form is actually V4SI (the type of the result).
5619
5620    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5621    indicates what is the actual level of parallelism (V8HI in the example), so
5622    that the right vectorization factor would be derived.  This vectype
5623    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5624    be used to create the vectorized stmt.  The right vectype for the vectorized
5625    stmt is obtained from the type of the result X:
5626         get_vectype_for_scalar_type (TREE_TYPE (X))
5627
5628    This means that, contrary to "regular" reductions (or "regular" stmts in
5629    general), the following equation:
5630       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5631    does *NOT* necessarily hold for reduction patterns.  */
5632
5633 bool
5634 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5635                         gimple **vec_stmt, slp_tree slp_node,
5636                         slp_instance slp_node_instance)
5637 {
5638   tree vec_dest;
5639   tree scalar_dest;
5640   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5641   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5642   tree vectype_in = NULL_TREE;
5643   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5644   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5645   enum tree_code code, orig_code;
5646   internal_fn reduc_fn;
5647   machine_mode vec_mode;
5648   int op_type;
5649   optab optab;
5650   tree new_temp = NULL_TREE;
5651   gimple *def_stmt;
5652   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5653   gimple *cond_reduc_def_stmt = NULL;
5654   enum tree_code cond_reduc_op_code = ERROR_MARK;
5655   tree scalar_type;
5656   bool is_simple_use;
5657   gimple *orig_stmt;
5658   stmt_vec_info orig_stmt_info = NULL;
5659   int i;
5660   int ncopies;
5661   int epilog_copies;
5662   stmt_vec_info prev_stmt_info, prev_phi_info;
5663   bool single_defuse_cycle = false;
5664   gimple *new_stmt = NULL;
5665   int j;
5666   tree ops[3];
5667   enum vect_def_type dts[3];
5668   bool nested_cycle = false, found_nested_cycle_def = false;
5669   bool double_reduc = false;
5670   basic_block def_bb;
5671   struct loop * def_stmt_loop, *outer_loop = NULL;
5672   tree def_arg;
5673   gimple *def_arg_stmt;
5674   auto_vec<tree> vec_oprnds0;
5675   auto_vec<tree> vec_oprnds1;
5676   auto_vec<tree> vec_oprnds2;
5677   auto_vec<tree> vect_defs;
5678   auto_vec<gimple *> phis;
5679   int vec_num;
5680   tree def0, tem;
5681   bool first_p = true;
5682   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5683   tree cond_reduc_val = NULL_TREE;
5684
5685   /* Make sure it was already recognized as a reduction computation.  */
5686   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5687       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5688     return false;
5689
5690   if (nested_in_vect_loop_p (loop, stmt))
5691     {
5692       outer_loop = loop;
5693       loop = loop->inner;
5694       nested_cycle = true;
5695     }
5696
5697   /* In case of reduction chain we switch to the first stmt in the chain, but
5698      we don't update STMT_INFO, since only the last stmt is marked as reduction
5699      and has reduction properties.  */
5700   if (GROUP_FIRST_ELEMENT (stmt_info)
5701       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5702     {
5703       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5704       first_p = false;
5705     }
5706
5707   if (gimple_code (stmt) == GIMPLE_PHI)
5708     {
5709       /* Analysis is fully done on the reduction stmt invocation.  */
5710       if (! vec_stmt)
5711         {
5712           if (slp_node)
5713             slp_node_instance->reduc_phis = slp_node;
5714
5715           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5716           return true;
5717         }
5718
5719       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5720       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5721         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5722
5723       gcc_assert (is_gimple_assign (reduc_stmt));
5724       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5725         {
5726           tree op = gimple_op (reduc_stmt, k);
5727           if (op == gimple_phi_result (stmt))
5728             continue;
5729           if (k == 1
5730               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5731             continue;
5732           tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5733           if (! vectype_in
5734               || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5735             vectype_in = tem;
5736           break;
5737         }
5738       gcc_assert (vectype_in);
5739
5740       if (slp_node)
5741         ncopies = 1;
5742       else
5743         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5744
5745       use_operand_p use_p;
5746       gimple *use_stmt;
5747       if (ncopies > 1
5748           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5749               <= vect_used_only_live)
5750           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5751           && (use_stmt == reduc_stmt
5752               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5753                   == reduc_stmt)))
5754         single_defuse_cycle = true;
5755
5756       /* Create the destination vector  */
5757       scalar_dest = gimple_assign_lhs (reduc_stmt);
5758       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5759
5760       if (slp_node)
5761         /* The size vect_schedule_slp_instance computes is off for us.  */
5762         vec_num = vect_get_num_vectors
5763           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5764            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
5765            vectype_in);
5766       else
5767         vec_num = 1;
5768
5769       /* Generate the reduction PHIs upfront.  */
5770       prev_phi_info = NULL;
5771       for (j = 0; j < ncopies; j++)
5772         {
5773           if (j == 0 || !single_defuse_cycle)
5774             {
5775               for (i = 0; i < vec_num; i++)
5776                 {
5777                   /* Create the reduction-phi that defines the reduction
5778                      operand.  */
5779                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
5780                   set_vinfo_for_stmt (new_phi,
5781                                       new_stmt_vec_info (new_phi, loop_vinfo));
5782
5783                   if (slp_node)
5784                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5785                   else
5786                     {
5787                       if (j == 0)
5788                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5789                       else
5790                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5791                       prev_phi_info = vinfo_for_stmt (new_phi);
5792                     }
5793                 }
5794             }
5795         }
5796
5797       return true;
5798     }
5799
5800   /* 1. Is vectorizable reduction?  */
5801   /* Not supportable if the reduction variable is used in the loop, unless
5802      it's a reduction chain.  */
5803   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5804       && !GROUP_FIRST_ELEMENT (stmt_info))
5805     return false;
5806
5807   /* Reductions that are not used even in an enclosing outer-loop,
5808      are expected to be "live" (used out of the loop).  */
5809   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5810       && !STMT_VINFO_LIVE_P (stmt_info))
5811     return false;
5812
5813   /* 2. Has this been recognized as a reduction pattern?
5814
5815      Check if STMT represents a pattern that has been recognized
5816      in earlier analysis stages.  For stmts that represent a pattern,
5817      the STMT_VINFO_RELATED_STMT field records the last stmt in
5818      the original sequence that constitutes the pattern.  */
5819
5820   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5821   if (orig_stmt)
5822     {
5823       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5824       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5825       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5826     }
5827
5828   /* 3. Check the operands of the operation.  The first operands are defined
5829         inside the loop body. The last operand is the reduction variable,
5830         which is defined by the loop-header-phi.  */
5831
5832   gcc_assert (is_gimple_assign (stmt));
5833
5834   /* Flatten RHS.  */
5835   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5836     {
5837     case GIMPLE_BINARY_RHS:
5838       code = gimple_assign_rhs_code (stmt);
5839       op_type = TREE_CODE_LENGTH (code);
5840       gcc_assert (op_type == binary_op);
5841       ops[0] = gimple_assign_rhs1 (stmt);
5842       ops[1] = gimple_assign_rhs2 (stmt);
5843       break;
5844
5845     case GIMPLE_TERNARY_RHS:
5846       code = gimple_assign_rhs_code (stmt);
5847       op_type = TREE_CODE_LENGTH (code);
5848       gcc_assert (op_type == ternary_op);
5849       ops[0] = gimple_assign_rhs1 (stmt);
5850       ops[1] = gimple_assign_rhs2 (stmt);
5851       ops[2] = gimple_assign_rhs3 (stmt);
5852       break;
5853
5854     case GIMPLE_UNARY_RHS:
5855       return false;
5856
5857     default:
5858       gcc_unreachable ();
5859     }
5860
5861   if (code == COND_EXPR && slp_node)
5862     return false;
5863
5864   scalar_dest = gimple_assign_lhs (stmt);
5865   scalar_type = TREE_TYPE (scalar_dest);
5866   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5867       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5868     return false;
5869
5870   /* Do not try to vectorize bit-precision reductions.  */
5871   if (!type_has_mode_precision_p (scalar_type))
5872     return false;
5873
5874   /* All uses but the last are expected to be defined in the loop.
5875      The last use is the reduction variable.  In case of nested cycle this
5876      assumption is not true: we use reduc_index to record the index of the
5877      reduction variable.  */
5878   gimple *reduc_def_stmt = NULL;
5879   int reduc_index = -1;
5880   for (i = 0; i < op_type; i++)
5881     {
5882       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5883       if (i == 0 && code == COND_EXPR)
5884         continue;
5885
5886       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5887                                           &def_stmt, &dts[i], &tem);
5888       dt = dts[i];
5889       gcc_assert (is_simple_use);
5890       if (dt == vect_reduction_def)
5891         {
5892           reduc_def_stmt = def_stmt;
5893           reduc_index = i;
5894           continue;
5895         }
5896       else if (tem)
5897         {
5898           /* To properly compute ncopies we are interested in the widest
5899              input type in case we're looking at a widening accumulation.  */
5900           if (!vectype_in
5901               || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem))
5902             vectype_in = tem;
5903         }
5904
5905       if (dt != vect_internal_def
5906           && dt != vect_external_def
5907           && dt != vect_constant_def
5908           && dt != vect_induction_def
5909           && !(dt == vect_nested_cycle && nested_cycle))
5910         return false;
5911
5912       if (dt == vect_nested_cycle)
5913         {
5914           found_nested_cycle_def = true;
5915           reduc_def_stmt = def_stmt;
5916           reduc_index = i;
5917         }
5918
5919       if (i == 1 && code == COND_EXPR)
5920         {
5921           /* Record how value of COND_EXPR is defined.  */
5922           if (dt == vect_constant_def)
5923             {
5924               cond_reduc_dt = dt;
5925               cond_reduc_val = ops[i];
5926             }
5927           if (dt == vect_induction_def
5928               && def_stmt != NULL
5929               && is_nonwrapping_integer_induction (def_stmt, loop))
5930             {
5931               cond_reduc_dt = dt;
5932               cond_reduc_def_stmt = def_stmt;
5933             }
5934         }
5935     }
5936
5937   if (!vectype_in)
5938     vectype_in = vectype_out;
5939
5940   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5941      directy used in stmt.  */
5942   if (reduc_index == -1)
5943     {
5944       if (orig_stmt)
5945         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5946       else
5947         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5948     }
5949
5950   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5951     return false;
5952
5953   if (!(reduc_index == -1
5954         || dts[reduc_index] == vect_reduction_def
5955         || dts[reduc_index] == vect_nested_cycle
5956         || ((dts[reduc_index] == vect_internal_def
5957              || dts[reduc_index] == vect_external_def
5958              || dts[reduc_index] == vect_constant_def
5959              || dts[reduc_index] == vect_induction_def)
5960             && nested_cycle && found_nested_cycle_def)))
5961     {
5962       /* For pattern recognized stmts, orig_stmt might be a reduction,
5963          but some helper statements for the pattern might not, or
5964          might be COND_EXPRs with reduction uses in the condition.  */
5965       gcc_assert (orig_stmt);
5966       return false;
5967     }
5968
5969   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5970   enum vect_reduction_type v_reduc_type
5971     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5972   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5973
5974   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5975   /* If we have a condition reduction, see if we can simplify it further.  */
5976   if (v_reduc_type == COND_REDUCTION)
5977     {
5978       if (cond_reduc_dt == vect_induction_def)
5979         {
5980           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
5981           tree base
5982             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
5983           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
5984
5985           gcc_assert (TREE_CODE (base) == INTEGER_CST
5986                       && TREE_CODE (step) == INTEGER_CST);
5987           cond_reduc_val = NULL_TREE;
5988           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
5989              above base; punt if base is the minimum value of the type for
5990              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
5991           if (tree_int_cst_sgn (step) == -1)
5992             {
5993               cond_reduc_op_code = MIN_EXPR;
5994               if (tree_int_cst_sgn (base) == -1)
5995                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5996               else if (tree_int_cst_lt (base,
5997                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
5998                 cond_reduc_val
5999                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6000             }
6001           else
6002             {
6003               cond_reduc_op_code = MAX_EXPR;
6004               if (tree_int_cst_sgn (base) == 1)
6005                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6006               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6007                                         base))
6008                 cond_reduc_val
6009                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6010             }
6011           if (cond_reduc_val)
6012             {
6013               if (dump_enabled_p ())
6014                 dump_printf_loc (MSG_NOTE, vect_location,
6015                                  "condition expression based on "
6016                                  "integer induction.\n");
6017               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6018                 = INTEGER_INDUC_COND_REDUCTION;
6019             }
6020         }
6021
6022       /* Loop peeling modifies initial value of reduction PHI, which
6023          makes the reduction stmt to be transformed different to the
6024          original stmt analyzed.  We need to record reduction code for
6025          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6026          it can be used directly at transform stage.  */
6027       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6028           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6029         {
6030           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6031           gcc_assert (cond_reduc_dt == vect_constant_def);
6032           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6033         }
6034       else if (cond_reduc_dt == vect_constant_def)
6035         {
6036           enum vect_def_type cond_initial_dt;
6037           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6038           tree cond_initial_val
6039             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6040
6041           gcc_assert (cond_reduc_val != NULL_TREE);
6042           vect_is_simple_use (cond_initial_val, loop_vinfo,
6043                               &def_stmt, &cond_initial_dt);
6044           if (cond_initial_dt == vect_constant_def
6045               && types_compatible_p (TREE_TYPE (cond_initial_val),
6046                                      TREE_TYPE (cond_reduc_val)))
6047             {
6048               tree e = fold_binary (LE_EXPR, boolean_type_node,
6049                                     cond_initial_val, cond_reduc_val);
6050               if (e && (integer_onep (e) || integer_zerop (e)))
6051                 {
6052                   if (dump_enabled_p ())
6053                     dump_printf_loc (MSG_NOTE, vect_location,
6054                                      "condition expression based on "
6055                                      "compile time constant.\n");
6056                   /* Record reduction code at analysis stage.  */
6057                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6058                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6059                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6060                     = CONST_COND_REDUCTION;
6061                 }
6062             }
6063         }
6064     }
6065
6066   if (orig_stmt)
6067     gcc_assert (tmp == orig_stmt
6068                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6069   else
6070     /* We changed STMT to be the first stmt in reduction chain, hence we
6071        check that in this case the first element in the chain is STMT.  */
6072     gcc_assert (stmt == tmp
6073                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6074
6075   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6076     return false;
6077
6078   if (slp_node)
6079     ncopies = 1;
6080   else
6081     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6082
6083   gcc_assert (ncopies >= 1);
6084
6085   vec_mode = TYPE_MODE (vectype_in);
6086
6087   if (code == COND_EXPR)
6088     {
6089       /* Only call during the analysis stage, otherwise we'll lose
6090          STMT_VINFO_TYPE.  */
6091       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6092                                                 ops[reduc_index], 0, NULL))
6093         {
6094           if (dump_enabled_p ())
6095             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6096                              "unsupported condition in reduction\n");
6097           return false;
6098         }
6099     }
6100   else
6101     {
6102       /* 4. Supportable by target?  */
6103
6104       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6105           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6106         {
6107           /* Shifts and rotates are only supported by vectorizable_shifts,
6108              not vectorizable_reduction.  */
6109           if (dump_enabled_p ())
6110             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6111                              "unsupported shift or rotation.\n");
6112           return false;
6113         }
6114
6115       /* 4.1. check support for the operation in the loop  */
6116       optab = optab_for_tree_code (code, vectype_in, optab_default);
6117       if (!optab)
6118         {
6119           if (dump_enabled_p ())
6120             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6121                              "no optab.\n");
6122
6123           return false;
6124         }
6125
6126       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6127         {
6128           if (dump_enabled_p ())
6129             dump_printf (MSG_NOTE, "op not supported by target.\n");
6130
6131           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6132               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6133             return false;
6134
6135           if (dump_enabled_p ())
6136             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6137         }
6138
6139       /* Worthwhile without SIMD support?  */
6140       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6141           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6142         {
6143           if (dump_enabled_p ())
6144             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6145                              "not worthwhile without SIMD support.\n");
6146
6147           return false;
6148         }
6149     }
6150
6151   /* 4.2. Check support for the epilog operation.
6152
6153           If STMT represents a reduction pattern, then the type of the
6154           reduction variable may be different than the type of the rest
6155           of the arguments.  For example, consider the case of accumulation
6156           of shorts into an int accumulator; The original code:
6157                         S1: int_a = (int) short_a;
6158           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6159
6160           was replaced with:
6161                         STMT: int_acc = widen_sum <short_a, int_acc>
6162
6163           This means that:
6164           1. The tree-code that is used to create the vector operation in the
6165              epilog code (that reduces the partial results) is not the
6166              tree-code of STMT, but is rather the tree-code of the original
6167              stmt from the pattern that STMT is replacing.  I.e, in the example
6168              above we want to use 'widen_sum' in the loop, but 'plus' in the
6169              epilog.
6170           2. The type (mode) we use to check available target support
6171              for the vector operation to be created in the *epilog*, is
6172              determined by the type of the reduction variable (in the example
6173              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6174              However the type (mode) we use to check available target support
6175              for the vector operation to be created *inside the loop*, is
6176              determined by the type of the other arguments to STMT (in the
6177              example we'd check this: optab_handler (widen_sum_optab,
6178              vect_short_mode)).
6179
6180           This is contrary to "regular" reductions, in which the types of all
6181           the arguments are the same as the type of the reduction variable.
6182           For "regular" reductions we can therefore use the same vector type
6183           (and also the same tree-code) when generating the epilog code and
6184           when generating the code inside the loop.  */
6185
6186   if (orig_stmt)
6187     {
6188       /* This is a reduction pattern: get the vectype from the type of the
6189          reduction variable, and get the tree-code from orig_stmt.  */
6190       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6191                   == TREE_CODE_REDUCTION);
6192       orig_code = gimple_assign_rhs_code (orig_stmt);
6193       gcc_assert (vectype_out);
6194       vec_mode = TYPE_MODE (vectype_out);
6195     }
6196   else
6197     {
6198       /* Regular reduction: use the same vectype and tree-code as used for
6199          the vector code inside the loop can be used for the epilog code. */
6200       orig_code = code;
6201
6202       if (code == MINUS_EXPR)
6203         orig_code = PLUS_EXPR;
6204
6205       /* For simple condition reductions, replace with the actual expression
6206          we want to base our reduction around.  */
6207       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6208         {
6209           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6210           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6211         }
6212       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6213                == INTEGER_INDUC_COND_REDUCTION)
6214         orig_code = cond_reduc_op_code;
6215     }
6216
6217   if (nested_cycle)
6218     {
6219       def_bb = gimple_bb (reduc_def_stmt);
6220       def_stmt_loop = def_bb->loop_father;
6221       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6222                                        loop_preheader_edge (def_stmt_loop));
6223       if (TREE_CODE (def_arg) == SSA_NAME
6224           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6225           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6226           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6227           && vinfo_for_stmt (def_arg_stmt)
6228           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6229               == vect_double_reduction_def)
6230         double_reduc = true;
6231     }
6232
6233   reduc_fn = IFN_LAST;
6234
6235   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6236     {
6237       if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6238         {
6239           if (reduc_fn != IFN_LAST
6240               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6241                                                   OPTIMIZE_FOR_SPEED))
6242             {
6243               if (dump_enabled_p ())
6244                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6245                                  "reduc op not supported by target.\n");
6246
6247               reduc_fn = IFN_LAST;
6248             }
6249         }
6250       else
6251         {
6252           if (!nested_cycle || double_reduc)
6253             {
6254               if (dump_enabled_p ())
6255                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6256                                  "no reduc code for scalar code.\n");
6257
6258               return false;
6259             }
6260         }
6261     }
6262   else
6263     {
6264       int scalar_precision
6265         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6266       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6267       cr_index_vector_type = build_vector_type
6268         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6269
6270       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6271                                           OPTIMIZE_FOR_SPEED))
6272         reduc_fn = IFN_REDUC_MAX;
6273     }
6274
6275   if ((double_reduc
6276        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6277       && ncopies > 1)
6278     {
6279       if (dump_enabled_p ())
6280         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6281                          "multiple types in double reduction or condition "
6282                          "reduction.\n");
6283       return false;
6284     }
6285
6286   /* In case of widenning multiplication by a constant, we update the type
6287      of the constant to be the type of the other operand.  We check that the
6288      constant fits the type in the pattern recognition pass.  */
6289   if (code == DOT_PROD_EXPR
6290       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6291     {
6292       if (TREE_CODE (ops[0]) == INTEGER_CST)
6293         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6294       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6295         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6296       else
6297         {
6298           if (dump_enabled_p ())
6299             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6300                              "invalid types in dot-prod\n");
6301
6302           return false;
6303         }
6304     }
6305
6306   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6307     {
6308       widest_int ni;
6309
6310       if (! max_loop_iterations (loop, &ni))
6311         {
6312           if (dump_enabled_p ())
6313             dump_printf_loc (MSG_NOTE, vect_location,
6314                              "loop count not known, cannot create cond "
6315                              "reduction.\n");
6316           return false;
6317         }
6318       /* Convert backedges to iterations.  */
6319       ni += 1;
6320
6321       /* The additional index will be the same type as the condition.  Check
6322          that the loop can fit into this less one (because we'll use up the
6323          zero slot for when there are no matches).  */
6324       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6325       if (wi::geu_p (ni, wi::to_widest (max_index)))
6326         {
6327           if (dump_enabled_p ())
6328             dump_printf_loc (MSG_NOTE, vect_location,
6329                              "loop size is greater than data size.\n");
6330           return false;
6331         }
6332     }
6333
6334   /* In case the vectorization factor (VF) is bigger than the number
6335      of elements that we can fit in a vectype (nunits), we have to generate
6336      more than one vector stmt - i.e - we need to "unroll" the
6337      vector stmt by a factor VF/nunits.  For more details see documentation
6338      in vectorizable_operation.  */
6339
6340   /* If the reduction is used in an outer loop we need to generate
6341      VF intermediate results, like so (e.g. for ncopies=2):
6342         r0 = phi (init, r0)
6343         r1 = phi (init, r1)
6344         r0 = x0 + r0;
6345         r1 = x1 + r1;
6346     (i.e. we generate VF results in 2 registers).
6347     In this case we have a separate def-use cycle for each copy, and therefore
6348     for each copy we get the vector def for the reduction variable from the
6349     respective phi node created for this copy.
6350
6351     Otherwise (the reduction is unused in the loop nest), we can combine
6352     together intermediate results, like so (e.g. for ncopies=2):
6353         r = phi (init, r)
6354         r = x0 + r;
6355         r = x1 + r;
6356    (i.e. we generate VF/2 results in a single register).
6357    In this case for each copy we get the vector def for the reduction variable
6358    from the vectorized reduction operation generated in the previous iteration.
6359
6360    This only works when we see both the reduction PHI and its only consumer
6361    in vectorizable_reduction and there are no intermediate stmts
6362    participating.  */
6363   use_operand_p use_p;
6364   gimple *use_stmt;
6365   if (ncopies > 1
6366       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6367       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6368       && (use_stmt == stmt
6369           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6370     {
6371       single_defuse_cycle = true;
6372       epilog_copies = 1;
6373     }
6374   else
6375     epilog_copies = ncopies;
6376
6377   /* If the reduction stmt is one of the patterns that have lane
6378      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6379   if ((ncopies > 1
6380        && ! single_defuse_cycle)
6381       && (code == DOT_PROD_EXPR
6382           || code == WIDEN_SUM_EXPR
6383           || code == SAD_EXPR))
6384     {
6385       if (dump_enabled_p ())
6386         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6387                          "multi def-use cycle not possible for lane-reducing "
6388                          "reduction operation\n");
6389       return false;
6390     }
6391
6392   if (!vec_stmt) /* transformation not required.  */
6393     {
6394       if (first_p)
6395         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
6396       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6397       return true;
6398     }
6399
6400   /* Transform.  */
6401
6402   if (dump_enabled_p ())
6403     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6404
6405   /* FORNOW: Multiple types are not supported for condition.  */
6406   if (code == COND_EXPR)
6407     gcc_assert (ncopies == 1);
6408
6409   /* Create the destination vector  */
6410   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6411
6412   prev_stmt_info = NULL;
6413   prev_phi_info = NULL;
6414   if (slp_node)
6415     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6416   else
6417     {
6418       vec_num = 1;
6419       vec_oprnds0.create (1);
6420       vec_oprnds1.create (1);
6421       if (op_type == ternary_op)
6422         vec_oprnds2.create (1);
6423     }
6424
6425   phis.create (vec_num);
6426   vect_defs.create (vec_num);
6427   if (!slp_node)
6428     vect_defs.quick_push (NULL_TREE);
6429
6430   if (slp_node)
6431     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6432   else
6433     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6434
6435   for (j = 0; j < ncopies; j++)
6436     {
6437       if (code == COND_EXPR)
6438         {
6439           gcc_assert (!slp_node);
6440           vectorizable_condition (stmt, gsi, vec_stmt,
6441                                   PHI_RESULT (phis[0]),
6442                                   reduc_index, NULL);
6443           /* Multiple types are not supported for condition.  */
6444           break;
6445         }
6446
6447       /* Handle uses.  */
6448       if (j == 0)
6449         {
6450           if (slp_node)
6451             {
6452               /* Get vec defs for all the operands except the reduction index,
6453                  ensuring the ordering of the ops in the vector is kept.  */
6454               auto_vec<tree, 3> slp_ops;
6455               auto_vec<vec<tree>, 3> vec_defs;
6456
6457               slp_ops.quick_push (ops[0]);
6458               slp_ops.quick_push (ops[1]);
6459               if (op_type == ternary_op)
6460                 slp_ops.quick_push (ops[2]);
6461
6462               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6463
6464               vec_oprnds0.safe_splice (vec_defs[0]);
6465               vec_defs[0].release ();
6466               vec_oprnds1.safe_splice (vec_defs[1]);
6467               vec_defs[1].release ();
6468               if (op_type == ternary_op)
6469                 {
6470                   vec_oprnds2.safe_splice (vec_defs[2]);
6471                   vec_defs[2].release ();
6472                 }
6473             }
6474           else
6475             {
6476               vec_oprnds0.quick_push
6477                 (vect_get_vec_def_for_operand (ops[0], stmt));
6478               vec_oprnds1.quick_push
6479                 (vect_get_vec_def_for_operand (ops[1], stmt));
6480               if (op_type == ternary_op)
6481                 vec_oprnds2.quick_push
6482                   (vect_get_vec_def_for_operand (ops[2], stmt));
6483             }
6484         }
6485       else
6486         {
6487           if (!slp_node)
6488             {
6489               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6490
6491               if (single_defuse_cycle && reduc_index == 0)
6492                 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6493               else
6494                 vec_oprnds0[0]
6495                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6496               if (single_defuse_cycle && reduc_index == 1)
6497                 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6498               else
6499                 vec_oprnds1[0]
6500                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6501               if (op_type == ternary_op)
6502                 {
6503                   if (single_defuse_cycle && reduc_index == 2)
6504                     vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6505                   else
6506                     vec_oprnds2[0]
6507                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6508                 }
6509             }
6510         }
6511
6512       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6513         {
6514           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6515           if (op_type == ternary_op)
6516             vop[2] = vec_oprnds2[i];
6517
6518           new_temp = make_ssa_name (vec_dest, new_stmt);
6519           new_stmt = gimple_build_assign (new_temp, code,
6520                                           vop[0], vop[1], vop[2]);
6521           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6522
6523           if (slp_node)
6524             {
6525               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6526               vect_defs.quick_push (new_temp);
6527             }
6528           else
6529             vect_defs[0] = new_temp;
6530         }
6531
6532       if (slp_node)
6533         continue;
6534
6535       if (j == 0)
6536         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6537       else
6538         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6539
6540       prev_stmt_info = vinfo_for_stmt (new_stmt);
6541     }
6542
6543   /* Finalize the reduction-phi (set its arguments) and create the
6544      epilog reduction code.  */
6545   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6546     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6547
6548   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6549                                     epilog_copies, reduc_fn, phis,
6550                                     double_reduc, slp_node, slp_node_instance,
6551                                     cond_reduc_val, cond_reduc_op_code);
6552
6553   return true;
6554 }
6555
6556 /* Function vect_min_worthwhile_factor.
6557
6558    For a loop where we could vectorize the operation indicated by CODE,
6559    return the minimum vectorization factor that makes it worthwhile
6560    to use generic vectors.  */
6561 static unsigned int
6562 vect_min_worthwhile_factor (enum tree_code code)
6563 {
6564   switch (code)
6565     {
6566     case PLUS_EXPR:
6567     case MINUS_EXPR:
6568     case NEGATE_EXPR:
6569       return 4;
6570
6571     case BIT_AND_EXPR:
6572     case BIT_IOR_EXPR:
6573     case BIT_XOR_EXPR:
6574     case BIT_NOT_EXPR:
6575       return 2;
6576
6577     default:
6578       return INT_MAX;
6579     }
6580 }
6581
6582 /* Return true if VINFO indicates we are doing loop vectorization and if
6583    it is worth decomposing CODE operations into scalar operations for
6584    that loop's vectorization factor.  */
6585
6586 bool
6587 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6588 {
6589   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6590   unsigned HOST_WIDE_INT value;
6591   return (loop_vinfo
6592           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
6593           && value >= vect_min_worthwhile_factor (code));
6594 }
6595
6596 /* Function vectorizable_induction
6597
6598    Check if PHI performs an induction computation that can be vectorized.
6599    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6600    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6601    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6602
6603 bool
6604 vectorizable_induction (gimple *phi,
6605                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6606                         gimple **vec_stmt, slp_tree slp_node)
6607 {
6608   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6609   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6610   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6611   unsigned ncopies;
6612   bool nested_in_vect_loop = false;
6613   struct loop *iv_loop;
6614   tree vec_def;
6615   edge pe = loop_preheader_edge (loop);
6616   basic_block new_bb;
6617   tree new_vec, vec_init, vec_step, t;
6618   tree new_name;
6619   gimple *new_stmt;
6620   gphi *induction_phi;
6621   tree induc_def, vec_dest;
6622   tree init_expr, step_expr;
6623   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6624   unsigned i;
6625   tree expr;
6626   gimple_seq stmts;
6627   imm_use_iterator imm_iter;
6628   use_operand_p use_p;
6629   gimple *exit_phi;
6630   edge latch_e;
6631   tree loop_arg;
6632   gimple_stmt_iterator si;
6633   basic_block bb = gimple_bb (phi);
6634
6635   if (gimple_code (phi) != GIMPLE_PHI)
6636     return false;
6637
6638   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6639     return false;
6640
6641   /* Make sure it was recognized as induction computation.  */
6642   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6643     return false;
6644
6645   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6646   unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6647
6648   if (slp_node)
6649     ncopies = 1;
6650   else
6651     ncopies = vect_get_num_copies (loop_vinfo, vectype);
6652   gcc_assert (ncopies >= 1);
6653
6654   /* FORNOW. These restrictions should be relaxed.  */
6655   if (nested_in_vect_loop_p (loop, phi))
6656     {
6657       imm_use_iterator imm_iter;
6658       use_operand_p use_p;
6659       gimple *exit_phi;
6660       edge latch_e;
6661       tree loop_arg;
6662
6663       if (ncopies > 1)
6664         {
6665           if (dump_enabled_p ())
6666             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6667                              "multiple types in nested loop.\n");
6668           return false;
6669         }
6670
6671       /* FORNOW: outer loop induction with SLP not supported.  */
6672       if (STMT_SLP_TYPE (stmt_info))
6673         return false;
6674
6675       exit_phi = NULL;
6676       latch_e = loop_latch_edge (loop->inner);
6677       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6678       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6679         {
6680           gimple *use_stmt = USE_STMT (use_p);
6681           if (is_gimple_debug (use_stmt))
6682             continue;
6683
6684           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6685             {
6686               exit_phi = use_stmt;
6687               break;
6688             }
6689         }
6690       if (exit_phi)
6691         {
6692           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6693           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6694                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6695             {
6696               if (dump_enabled_p ())
6697                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6698                                  "inner-loop induction only used outside "
6699                                  "of the outer vectorized loop.\n");
6700               return false;
6701             }
6702         }
6703
6704       nested_in_vect_loop = true;
6705       iv_loop = loop->inner;
6706     }
6707   else
6708     iv_loop = loop;
6709   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6710
6711   if (!vec_stmt) /* transformation not required.  */
6712     {
6713       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6714       if (dump_enabled_p ())
6715         dump_printf_loc (MSG_NOTE, vect_location,
6716                          "=== vectorizable_induction ===\n");
6717       vect_model_induction_cost (stmt_info, ncopies);
6718       return true;
6719     }
6720
6721   /* Transform.  */
6722
6723   /* Compute a vector variable, initialized with the first VF values of
6724      the induction variable.  E.g., for an iv with IV_PHI='X' and
6725      evolution S, for a vector of 4 units, we want to compute:
6726      [X, X + S, X + 2*S, X + 3*S].  */
6727
6728   if (dump_enabled_p ())
6729     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6730
6731   latch_e = loop_latch_edge (iv_loop);
6732   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6733
6734   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6735   gcc_assert (step_expr != NULL_TREE);
6736
6737   pe = loop_preheader_edge (iv_loop);
6738   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6739                                      loop_preheader_edge (iv_loop));
6740
6741   /* Convert the step to the desired type.  */
6742   stmts = NULL;
6743   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6744   if (stmts)
6745     {
6746       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6747       gcc_assert (!new_bb);
6748     }
6749
6750   /* Find the first insertion point in the BB.  */
6751   si = gsi_after_labels (bb);
6752
6753   /* For SLP induction we have to generate several IVs as for example
6754      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6755      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6756      [VF*S, VF*S, VF*S, VF*S] for all.  */
6757   if (slp_node)
6758     {
6759       /* Convert the init to the desired type.  */
6760       stmts = NULL;
6761       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6762       if (stmts)
6763         {
6764           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6765           gcc_assert (!new_bb);
6766         }
6767
6768       /* Generate [VF*S, VF*S, ... ].  */
6769       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6770         {
6771           expr = build_int_cst (integer_type_node, vf);
6772           expr = fold_convert (TREE_TYPE (step_expr), expr);
6773         }
6774       else
6775         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6776       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6777                               expr, step_expr);
6778       if (! CONSTANT_CLASS_P (new_name))
6779         new_name = vect_init_vector (phi, new_name,
6780                                      TREE_TYPE (step_expr), NULL);
6781       new_vec = build_vector_from_val (vectype, new_name);
6782       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6783
6784       /* Now generate the IVs.  */
6785       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6786       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6787       unsigned elts = nunits * nvects;
6788       unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6789       gcc_assert (elts % group_size == 0);
6790       tree elt = init_expr;
6791       unsigned ivn;
6792       for (ivn = 0; ivn < nivs; ++ivn)
6793         {
6794           tree_vector_builder elts (vectype, nunits, 1);
6795           stmts = NULL;
6796           for (unsigned eltn = 0; eltn < nunits; ++eltn)
6797             {
6798               if (ivn*nunits + eltn >= group_size
6799                   && (ivn*nunits + eltn) % group_size == 0)
6800                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6801                                     elt, step_expr);
6802               elts.quick_push (elt);
6803             }
6804           vec_init = gimple_build_vector (&stmts, &elts);
6805           if (stmts)
6806             {
6807               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6808               gcc_assert (!new_bb);
6809             }
6810
6811           /* Create the induction-phi that defines the induction-operand.  */
6812           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6813           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6814           set_vinfo_for_stmt (induction_phi,
6815                               new_stmt_vec_info (induction_phi, loop_vinfo));
6816           induc_def = PHI_RESULT (induction_phi);
6817
6818           /* Create the iv update inside the loop  */
6819           vec_def = make_ssa_name (vec_dest);
6820           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6821           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6822           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6823
6824           /* Set the arguments of the phi node:  */
6825           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6826           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6827                        UNKNOWN_LOCATION);
6828
6829           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6830         }
6831
6832       /* Re-use IVs when we can.  */
6833       if (ivn < nvects)
6834         {
6835           unsigned vfp
6836             = least_common_multiple (group_size, nunits) / group_size;
6837           /* Generate [VF'*S, VF'*S, ... ].  */
6838           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6839             {
6840               expr = build_int_cst (integer_type_node, vfp);
6841               expr = fold_convert (TREE_TYPE (step_expr), expr);
6842             }
6843           else
6844             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6845           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6846                                   expr, step_expr);
6847           if (! CONSTANT_CLASS_P (new_name))
6848             new_name = vect_init_vector (phi, new_name,
6849                                          TREE_TYPE (step_expr), NULL);
6850           new_vec = build_vector_from_val (vectype, new_name);
6851           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6852           for (; ivn < nvects; ++ivn)
6853             {
6854               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6855               tree def;
6856               if (gimple_code (iv) == GIMPLE_PHI)
6857                 def = gimple_phi_result (iv);
6858               else
6859                 def = gimple_assign_lhs (iv);
6860               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6861                                               PLUS_EXPR,
6862                                               def, vec_step);
6863               if (gimple_code (iv) == GIMPLE_PHI)
6864                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6865               else
6866                 {
6867                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6868                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6869                 }
6870               set_vinfo_for_stmt (new_stmt,
6871                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6872               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6873             }
6874         }
6875
6876       return true;
6877     }
6878
6879   /* Create the vector that holds the initial_value of the induction.  */
6880   if (nested_in_vect_loop)
6881     {
6882       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6883          been created during vectorization of previous stmts.  We obtain it
6884          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6885       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6886       /* If the initial value is not of proper type, convert it.  */
6887       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6888         {
6889           new_stmt
6890             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6891                                                           vect_simple_var,
6892                                                           "vec_iv_"),
6893                                    VIEW_CONVERT_EXPR,
6894                                    build1 (VIEW_CONVERT_EXPR, vectype,
6895                                            vec_init));
6896           vec_init = gimple_assign_lhs (new_stmt);
6897           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6898                                                  new_stmt);
6899           gcc_assert (!new_bb);
6900           set_vinfo_for_stmt (new_stmt,
6901                               new_stmt_vec_info (new_stmt, loop_vinfo));
6902         }
6903     }
6904   else
6905     {
6906       /* iv_loop is the loop to be vectorized. Create:
6907          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6908       stmts = NULL;
6909       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6910
6911       tree_vector_builder elts (vectype, nunits, 1);
6912       elts.quick_push (new_name);
6913       for (i = 1; i < nunits; i++)
6914         {
6915           /* Create: new_name_i = new_name + step_expr  */
6916           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6917                                    new_name, step_expr);
6918           elts.quick_push (new_name);
6919         }
6920       /* Create a vector from [new_name_0, new_name_1, ...,
6921          new_name_nunits-1]  */
6922       vec_init = gimple_build_vector (&stmts, &elts);
6923       if (stmts)
6924         {
6925           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6926           gcc_assert (!new_bb);
6927         }
6928     }
6929
6930
6931   /* Create the vector that holds the step of the induction.  */
6932   if (nested_in_vect_loop)
6933     /* iv_loop is nested in the loop to be vectorized. Generate:
6934        vec_step = [S, S, S, S]  */
6935     new_name = step_expr;
6936   else
6937     {
6938       /* iv_loop is the loop to be vectorized. Generate:
6939           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
6940       gimple_seq seq = NULL;
6941       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6942         {
6943           expr = build_int_cst (integer_type_node, vf);
6944           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6945         }
6946       else
6947         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6948       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6949                                expr, step_expr);
6950       if (seq)
6951         {
6952           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6953           gcc_assert (!new_bb);
6954         }
6955     }
6956
6957   t = unshare_expr (new_name);
6958   gcc_assert (CONSTANT_CLASS_P (new_name)
6959               || TREE_CODE (new_name) == SSA_NAME);
6960   new_vec = build_vector_from_val (vectype, t);
6961   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6962
6963
6964   /* Create the following def-use cycle:
6965      loop prolog:
6966          vec_init = ...
6967          vec_step = ...
6968      loop:
6969          vec_iv = PHI <vec_init, vec_loop>
6970          ...
6971          STMT
6972          ...
6973          vec_loop = vec_iv + vec_step;  */
6974
6975   /* Create the induction-phi that defines the induction-operand.  */
6976   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6977   induction_phi = create_phi_node (vec_dest, iv_loop->header);
6978   set_vinfo_for_stmt (induction_phi,
6979                       new_stmt_vec_info (induction_phi, loop_vinfo));
6980   induc_def = PHI_RESULT (induction_phi);
6981
6982   /* Create the iv update inside the loop  */
6983   vec_def = make_ssa_name (vec_dest);
6984   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6985   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6986   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6987
6988   /* Set the arguments of the phi node:  */
6989   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6990   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6991                UNKNOWN_LOCATION);
6992
6993   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6994
6995   /* In case that vectorization factor (VF) is bigger than the number
6996      of elements that we can fit in a vectype (nunits), we have to generate
6997      more than one vector stmt - i.e - we need to "unroll" the
6998      vector stmt by a factor VF/nunits.  For more details see documentation
6999      in vectorizable_operation.  */
7000
7001   if (ncopies > 1)
7002     {
7003       gimple_seq seq = NULL;
7004       stmt_vec_info prev_stmt_vinfo;
7005       /* FORNOW. This restriction should be relaxed.  */
7006       gcc_assert (!nested_in_vect_loop);
7007
7008       /* Create the vector that holds the step of the induction.  */
7009       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7010         {
7011           expr = build_int_cst (integer_type_node, nunits);
7012           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7013         }
7014       else
7015         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7016       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7017                                expr, step_expr);
7018       if (seq)
7019         {
7020           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7021           gcc_assert (!new_bb);
7022         }
7023
7024       t = unshare_expr (new_name);
7025       gcc_assert (CONSTANT_CLASS_P (new_name)
7026                   || TREE_CODE (new_name) == SSA_NAME);
7027       new_vec = build_vector_from_val (vectype, t);
7028       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7029
7030       vec_def = induc_def;
7031       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7032       for (i = 1; i < ncopies; i++)
7033         {
7034           /* vec_i = vec_prev + vec_step  */
7035           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7036                                           vec_def, vec_step);
7037           vec_def = make_ssa_name (vec_dest, new_stmt);
7038           gimple_assign_set_lhs (new_stmt, vec_def);
7039
7040           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7041           set_vinfo_for_stmt (new_stmt,
7042                               new_stmt_vec_info (new_stmt, loop_vinfo));
7043           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7044           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7045         }
7046     }
7047
7048   if (nested_in_vect_loop)
7049     {
7050       /* Find the loop-closed exit-phi of the induction, and record
7051          the final vector of induction results:  */
7052       exit_phi = NULL;
7053       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7054         {
7055           gimple *use_stmt = USE_STMT (use_p);
7056           if (is_gimple_debug (use_stmt))
7057             continue;
7058
7059           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7060             {
7061               exit_phi = use_stmt;
7062               break;
7063             }
7064         }
7065       if (exit_phi)
7066         {
7067           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7068           /* FORNOW. Currently not supporting the case that an inner-loop induction
7069              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7070           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7071                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7072
7073           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7074           if (dump_enabled_p ())
7075             {
7076               dump_printf_loc (MSG_NOTE, vect_location,
7077                                "vector of inductions after inner-loop:");
7078               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7079             }
7080         }
7081     }
7082
7083
7084   if (dump_enabled_p ())
7085     {
7086       dump_printf_loc (MSG_NOTE, vect_location,
7087                        "transform induction: created def-use cycle: ");
7088       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7089       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7090                         SSA_NAME_DEF_STMT (vec_def), 0);
7091     }
7092
7093   return true;
7094 }
7095
7096 /* Function vectorizable_live_operation.
7097
7098    STMT computes a value that is used outside the loop.  Check if
7099    it can be supported.  */
7100
7101 bool
7102 vectorizable_live_operation (gimple *stmt,
7103                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7104                              slp_tree slp_node, int slp_index,
7105                              gimple **vec_stmt)
7106 {
7107   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7108   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7109   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7110   imm_use_iterator imm_iter;
7111   tree lhs, lhs_type, bitsize, vec_bitsize;
7112   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7113   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7114   int ncopies;
7115   gimple *use_stmt;
7116   auto_vec<tree> vec_oprnds;
7117
7118   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7119
7120   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7121     return false;
7122
7123   /* FORNOW.  CHECKME.  */
7124   if (nested_in_vect_loop_p (loop, stmt))
7125     return false;
7126
7127   /* If STMT is not relevant and it is a simple assignment and its inputs are
7128      invariant then it can remain in place, unvectorized.  The original last
7129      scalar value that it computes will be used.  */
7130   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7131     {
7132       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7133       if (dump_enabled_p ())
7134         dump_printf_loc (MSG_NOTE, vect_location,
7135                          "statement is simple and uses invariant.  Leaving in "
7136                          "place.\n");
7137       return true;
7138     }
7139
7140   if (slp_node)
7141     ncopies = 1;
7142   else
7143     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7144
7145   if (!vec_stmt)
7146     /* No transformation required.  */
7147     return true;
7148
7149   /* If stmt has a related stmt, then use that for getting the lhs.  */
7150   if (is_pattern_stmt_p (stmt_info))
7151     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7152
7153   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7154         : gimple_get_lhs (stmt);
7155   lhs_type = TREE_TYPE (lhs);
7156
7157   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7158              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7159              : TYPE_SIZE (TREE_TYPE (vectype)));
7160   vec_bitsize = TYPE_SIZE (vectype);
7161
7162   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7163   tree vec_lhs, bitstart;
7164   if (slp_node)
7165     {
7166       gcc_assert (slp_index >= 0);
7167
7168       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7169       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7170
7171       /* Get the last occurrence of the scalar index from the concatenation of
7172          all the slp vectors. Calculate which slp vector it is and the index
7173          within.  */
7174       int pos = (num_vec * nunits) - num_scalar + slp_index;
7175       int vec_entry = pos / nunits;
7176       int vec_index = pos % nunits;
7177
7178       /* Get the correct slp vectorized stmt.  */
7179       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7180
7181       /* Get entry to use.  */
7182       bitstart = bitsize_int (vec_index);
7183       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7184     }
7185   else
7186     {
7187       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7188       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7189
7190       /* For multiple copies, get the last copy.  */
7191       for (int i = 1; i < ncopies; ++i)
7192         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7193                                                   vec_lhs);
7194
7195       /* Get the last lane in the vector.  */
7196       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7197     }
7198
7199   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7200      loop.  */
7201   gimple_seq stmts = NULL;
7202   tree bftype = TREE_TYPE (vectype);
7203   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7204     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7205   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7206   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7207                                    true, NULL_TREE);
7208   if (stmts)
7209     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7210
7211   /* Replace use of lhs with newly computed result.  If the use stmt is a
7212      single arg PHI, just replace all uses of PHI result.  It's necessary
7213      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7214   use_operand_p use_p;
7215   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7216     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7217         && !is_gimple_debug (use_stmt))
7218     {
7219       if (gimple_code (use_stmt) == GIMPLE_PHI
7220           && gimple_phi_num_args (use_stmt) == 1)
7221         {
7222           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7223         }
7224       else
7225         {
7226           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7227             SET_USE (use_p, new_tree);
7228         }
7229       update_stmt (use_stmt);
7230     }
7231
7232   return true;
7233 }
7234
7235 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
7236
7237 static void
7238 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7239 {
7240   ssa_op_iter op_iter;
7241   imm_use_iterator imm_iter;
7242   def_operand_p def_p;
7243   gimple *ustmt;
7244
7245   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7246     {
7247       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7248         {
7249           basic_block bb;
7250
7251           if (!is_gimple_debug (ustmt))
7252             continue;
7253
7254           bb = gimple_bb (ustmt);
7255
7256           if (!flow_bb_inside_loop_p (loop, bb))
7257             {
7258               if (gimple_debug_bind_p (ustmt))
7259                 {
7260                   if (dump_enabled_p ())
7261                     dump_printf_loc (MSG_NOTE, vect_location,
7262                                      "killing debug use\n");
7263
7264                   gimple_debug_bind_reset_value (ustmt);
7265                   update_stmt (ustmt);
7266                 }
7267               else
7268                 gcc_unreachable ();
7269             }
7270         }
7271     }
7272 }
7273
7274 /* Given loop represented by LOOP_VINFO, return true if computation of
7275    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7276    otherwise.  */
7277
7278 static bool
7279 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7280 {
7281   /* Constant case.  */
7282   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7283     {
7284       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7285       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7286
7287       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7288       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7289       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7290         return true;
7291     }
7292
7293   widest_int max;
7294   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7295   /* Check the upper bound of loop niters.  */
7296   if (get_max_loop_iterations (loop, &max))
7297     {
7298       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7299       signop sgn = TYPE_SIGN (type);
7300       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7301       if (max < type_max)
7302         return true;
7303     }
7304   return false;
7305 }
7306
7307 /* Scale profiling counters by estimation for LOOP which is vectorized
7308    by factor VF.  */
7309
7310 static void
7311 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7312 {
7313   edge preheader = loop_preheader_edge (loop);
7314   /* Reduce loop iterations by the vectorization factor.  */
7315   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7316   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7317
7318   if (freq_h.nonzero_p ())
7319     {
7320       profile_probability p;
7321
7322       /* Avoid dropping loop body profile counter to 0 because of zero count
7323          in loop's preheader.  */
7324       if (!(freq_e == profile_count::zero ()))
7325         freq_e = freq_e.force_nonzero ();
7326       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7327       scale_loop_frequencies (loop, p);
7328     }
7329
7330   edge exit_e = single_exit (loop);
7331   exit_e->probability = profile_probability::always ()
7332                                  .apply_scale (1, new_est_niter + 1);
7333
7334   edge exit_l = single_pred_edge (loop->latch);
7335   profile_probability prob = exit_l->probability;
7336   exit_l->probability = exit_e->probability.invert ();
7337   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7338     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7339 }
7340
7341 /* Function vect_transform_loop.
7342
7343    The analysis phase has determined that the loop is vectorizable.
7344    Vectorize the loop - created vectorized stmts to replace the scalar
7345    stmts in the loop, and update the loop exit condition.
7346    Returns scalar epilogue loop if any.  */
7347
7348 struct loop *
7349 vect_transform_loop (loop_vec_info loop_vinfo)
7350 {
7351   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7352   struct loop *epilogue = NULL;
7353   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7354   int nbbs = loop->num_nodes;
7355   int i;
7356   tree niters_vector = NULL_TREE;
7357   tree step_vector = NULL_TREE;
7358   tree niters_vector_mult_vf = NULL_TREE;
7359   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7360   unsigned int lowest_vf = constant_lower_bound (vf);
7361   bool grouped_store;
7362   bool slp_scheduled = false;
7363   gimple *stmt, *pattern_stmt;
7364   gimple_seq pattern_def_seq = NULL;
7365   gimple_stmt_iterator pattern_def_si = gsi_none ();
7366   bool transform_pattern_stmt = false;
7367   bool check_profitability = false;
7368   unsigned int th;
7369
7370   if (dump_enabled_p ())
7371     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7372
7373   /* Use the more conservative vectorization threshold.  If the number
7374      of iterations is constant assume the cost check has been performed
7375      by our caller.  If the threshold makes all loops profitable that
7376      run at least the (estimated) vectorization factor number of times
7377      checking is pointless, too.  */
7378   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7379   if (th >= vect_vf_for_cost (loop_vinfo)
7380       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7381     {
7382       if (dump_enabled_p ())
7383         dump_printf_loc (MSG_NOTE, vect_location,
7384                          "Profitability threshold is %d loop iterations.\n",
7385                          th);
7386       check_profitability = true;
7387     }
7388
7389   /* Make sure there exists a single-predecessor exit bb.  Do this before
7390      versioning.   */
7391   edge e = single_exit (loop);
7392   if (! single_pred_p (e->dest))
7393     {
7394       split_loop_exit_edge (e);
7395       if (dump_enabled_p ())
7396         dump_printf (MSG_NOTE, "split exit edge\n");
7397     }
7398
7399   /* Version the loop first, if required, so the profitability check
7400      comes first.  */
7401
7402   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7403     {
7404       poly_uint64 versioning_threshold
7405         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
7406       if (check_profitability
7407           && ordered_p (poly_uint64 (th), versioning_threshold))
7408         {
7409           versioning_threshold = ordered_max (poly_uint64 (th),
7410                                               versioning_threshold);
7411           check_profitability = false;
7412         }
7413       vect_loop_versioning (loop_vinfo, th, check_profitability,
7414                             versioning_threshold);
7415       check_profitability = false;
7416     }
7417
7418   /* Make sure there exists a single-predecessor exit bb also on the
7419      scalar loop copy.  Do this after versioning but before peeling
7420      so CFG structure is fine for both scalar and if-converted loop
7421      to make slpeel_duplicate_current_defs_from_edges face matched
7422      loop closed PHI nodes on the exit.  */
7423   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7424     {
7425       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7426       if (! single_pred_p (e->dest))
7427         {
7428           split_loop_exit_edge (e);
7429           if (dump_enabled_p ())
7430             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7431         }
7432     }
7433
7434   tree niters = vect_build_loop_niters (loop_vinfo);
7435   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7436   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7437   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7438   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
7439                               &step_vector, &niters_vector_mult_vf, th,
7440                               check_profitability, niters_no_overflow);
7441   if (niters_vector == NULL_TREE)
7442     {
7443       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && known_eq (lowest_vf, vf))
7444         {
7445           niters_vector
7446             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7447                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
7448           step_vector = build_one_cst (TREE_TYPE (niters));
7449         }
7450       else
7451         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7452                                      &step_vector, niters_no_overflow);
7453     }
7454
7455   /* 1) Make sure the loop header has exactly two entries
7456      2) Make sure we have a preheader basic block.  */
7457
7458   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7459
7460   split_edge (loop_preheader_edge (loop));
7461
7462   /* FORNOW: the vectorizer supports only loops which body consist
7463      of one basic block (header + empty latch). When the vectorizer will
7464      support more involved loop forms, the order by which the BBs are
7465      traversed need to be reconsidered.  */
7466
7467   for (i = 0; i < nbbs; i++)
7468     {
7469       basic_block bb = bbs[i];
7470       stmt_vec_info stmt_info;
7471
7472       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7473            gsi_next (&si))
7474         {
7475           gphi *phi = si.phi ();
7476           if (dump_enabled_p ())
7477             {
7478               dump_printf_loc (MSG_NOTE, vect_location,
7479                                "------>vectorizing phi: ");
7480               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7481             }
7482           stmt_info = vinfo_for_stmt (phi);
7483           if (!stmt_info)
7484             continue;
7485
7486           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7487             vect_loop_kill_debug_uses (loop, phi);
7488
7489           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7490               && !STMT_VINFO_LIVE_P (stmt_info))
7491             continue;
7492
7493           if (STMT_VINFO_VECTYPE (stmt_info)
7494               && (maybe_ne
7495                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
7496               && dump_enabled_p ())
7497             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7498
7499           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7500                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7501                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7502               && ! PURE_SLP_STMT (stmt_info))
7503             {
7504               if (dump_enabled_p ())
7505                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7506               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7507             }
7508         }
7509
7510       pattern_stmt = NULL;
7511       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7512            !gsi_end_p (si) || transform_pattern_stmt;)
7513         {
7514           bool is_store;
7515
7516           if (transform_pattern_stmt)
7517             stmt = pattern_stmt;
7518           else
7519             {
7520               stmt = gsi_stmt (si);
7521               /* During vectorization remove existing clobber stmts.  */
7522               if (gimple_clobber_p (stmt))
7523                 {
7524                   unlink_stmt_vdef (stmt);
7525                   gsi_remove (&si, true);
7526                   release_defs (stmt);
7527                   continue;
7528                 }
7529             }
7530
7531           if (dump_enabled_p ())
7532             {
7533               dump_printf_loc (MSG_NOTE, vect_location,
7534                                "------>vectorizing statement: ");
7535               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7536             }
7537
7538           stmt_info = vinfo_for_stmt (stmt);
7539
7540           /* vector stmts created in the outer-loop during vectorization of
7541              stmts in an inner-loop may not have a stmt_info, and do not
7542              need to be vectorized.  */
7543           if (!stmt_info)
7544             {
7545               gsi_next (&si);
7546               continue;
7547             }
7548
7549           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7550             vect_loop_kill_debug_uses (loop, stmt);
7551
7552           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7553               && !STMT_VINFO_LIVE_P (stmt_info))
7554             {
7555               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7556                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7557                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7558                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7559                 {
7560                   stmt = pattern_stmt;
7561                   stmt_info = vinfo_for_stmt (stmt);
7562                 }
7563               else
7564                 {
7565                   gsi_next (&si);
7566                   continue;
7567                 }
7568             }
7569           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7570                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7571                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7572                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7573             transform_pattern_stmt = true;
7574
7575           /* If pattern statement has def stmts, vectorize them too.  */
7576           if (is_pattern_stmt_p (stmt_info))
7577             {
7578               if (pattern_def_seq == NULL)
7579                 {
7580                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7581                   pattern_def_si = gsi_start (pattern_def_seq);
7582                 }
7583               else if (!gsi_end_p (pattern_def_si))
7584                 gsi_next (&pattern_def_si);
7585               if (pattern_def_seq != NULL)
7586                 {
7587                   gimple *pattern_def_stmt = NULL;
7588                   stmt_vec_info pattern_def_stmt_info = NULL;
7589
7590                   while (!gsi_end_p (pattern_def_si))
7591                     {
7592                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7593                       pattern_def_stmt_info
7594                         = vinfo_for_stmt (pattern_def_stmt);
7595                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7596                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7597                         break;
7598                       gsi_next (&pattern_def_si);
7599                     }
7600
7601                   if (!gsi_end_p (pattern_def_si))
7602                     {
7603                       if (dump_enabled_p ())
7604                         {
7605                           dump_printf_loc (MSG_NOTE, vect_location,
7606                                            "==> vectorizing pattern def "
7607                                            "stmt: ");
7608                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7609                                             pattern_def_stmt, 0);
7610                         }
7611
7612                       stmt = pattern_def_stmt;
7613                       stmt_info = pattern_def_stmt_info;
7614                     }
7615                   else
7616                     {
7617                       pattern_def_si = gsi_none ();
7618                       transform_pattern_stmt = false;
7619                     }
7620                 }
7621               else
7622                 transform_pattern_stmt = false;
7623             }
7624
7625           if (STMT_VINFO_VECTYPE (stmt_info))
7626             {
7627               unsigned int nunits
7628                 = (unsigned int)
7629                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7630               if (!STMT_SLP_TYPE (stmt_info)
7631                   && maybe_ne (nunits, vf)
7632                   && dump_enabled_p ())
7633                   /* For SLP VF is set according to unrolling factor, and not
7634                      to vector size, hence for SLP this print is not valid.  */
7635                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7636             }
7637
7638           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7639              reached.  */
7640           if (STMT_SLP_TYPE (stmt_info))
7641             {
7642               if (!slp_scheduled)
7643                 {
7644                   slp_scheduled = true;
7645
7646                   if (dump_enabled_p ())
7647                     dump_printf_loc (MSG_NOTE, vect_location,
7648                                      "=== scheduling SLP instances ===\n");
7649
7650                   vect_schedule_slp (loop_vinfo);
7651                 }
7652
7653               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7654               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7655                 {
7656                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7657                     {
7658                       pattern_def_seq = NULL;
7659                       gsi_next (&si);
7660                     }
7661                   continue;
7662                 }
7663             }
7664
7665           /* -------- vectorize statement ------------ */
7666           if (dump_enabled_p ())
7667             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7668
7669           grouped_store = false;
7670           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7671           if (is_store)
7672             {
7673               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7674                 {
7675                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7676                      interleaving chain was completed - free all the stores in
7677                      the chain.  */
7678                   gsi_next (&si);
7679                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7680                 }
7681               else
7682                 {
7683                   /* Free the attached stmt_vec_info and remove the stmt.  */
7684                   gimple *store = gsi_stmt (si);
7685                   free_stmt_vec_info (store);
7686                   unlink_stmt_vdef (store);
7687                   gsi_remove (&si, true);
7688                   release_defs (store);
7689                 }
7690
7691               /* Stores can only appear at the end of pattern statements.  */
7692               gcc_assert (!transform_pattern_stmt);
7693               pattern_def_seq = NULL;
7694             }
7695           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7696             {
7697               pattern_def_seq = NULL;
7698               gsi_next (&si);
7699             }
7700         }                       /* stmts in BB */
7701     }                           /* BBs in loop */
7702
7703   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
7704      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
7705   if (integer_onep (step_vector))
7706     niters_no_overflow = true;
7707   slpeel_make_loop_iterate_ntimes (loop, niters_vector, step_vector,
7708                                    niters_vector_mult_vf,
7709                                    !niters_no_overflow);
7710
7711   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
7712   scale_profile_for_vect_loop (loop, assumed_vf);
7713
7714   /* The minimum number of iterations performed by the epilogue.  This
7715      is 1 when peeling for gaps because we always need a final scalar
7716      iteration.  */
7717   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7718   /* +1 to convert latch counts to loop iteration counts,
7719      -min_epilogue_iters to remove iterations that cannot be performed
7720        by the vector code.  */
7721   int bias = 1 - min_epilogue_iters;
7722   /* In these calculations the "- 1" converts loop iteration counts
7723      back to latch counts.  */
7724   if (loop->any_upper_bound)
7725     loop->nb_iterations_upper_bound
7726       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
7727                         lowest_vf) - 1;
7728   if (loop->any_likely_upper_bound)
7729     loop->nb_iterations_likely_upper_bound
7730       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
7731                         lowest_vf) - 1;
7732   if (loop->any_estimate)
7733     loop->nb_iterations_estimate
7734       = wi::udiv_floor (loop->nb_iterations_estimate + bias,
7735                         assumed_vf) - 1;
7736
7737   if (dump_enabled_p ())
7738     {
7739       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7740         {
7741           dump_printf_loc (MSG_NOTE, vect_location,
7742                            "LOOP VECTORIZED\n");
7743           if (loop->inner)
7744             dump_printf_loc (MSG_NOTE, vect_location,
7745                              "OUTER LOOP VECTORIZED\n");
7746           dump_printf (MSG_NOTE, "\n");
7747         }
7748       else
7749         dump_printf_loc (MSG_NOTE, vect_location,
7750                          "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7751                          current_vector_size);
7752     }
7753
7754   /* Free SLP instances here because otherwise stmt reference counting
7755      won't work.  */
7756   slp_instance instance;
7757   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7758     vect_free_slp_instance (instance);
7759   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7760   /* Clear-up safelen field since its value is invalid after vectorization
7761      since vectorized loop can have loop-carried dependencies.  */
7762   loop->safelen = 0;
7763
7764   /* Don't vectorize epilogue for epilogue.  */
7765   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7766     epilogue = NULL;
7767
7768   if (epilogue)
7769     {
7770         unsigned int vector_sizes
7771           = targetm.vectorize.autovectorize_vector_sizes ();
7772         vector_sizes &= current_vector_size - 1;
7773
7774         if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7775           epilogue = NULL;
7776         else if (!vector_sizes)
7777           epilogue = NULL;
7778         else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7779                  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
7780                  && known_eq (vf, lowest_vf))
7781           {
7782             int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7783             int ratio = current_vector_size / smallest_vec_size;
7784             unsigned HOST_WIDE_INT eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7785               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7786             eiters = eiters % lowest_vf;
7787
7788             epilogue->nb_iterations_upper_bound = eiters - 1;
7789
7790             if (eiters < lowest_vf / ratio)
7791               epilogue = NULL;
7792             }
7793     }
7794
7795   if (epilogue)
7796     {
7797       epilogue->force_vectorize = loop->force_vectorize;
7798       epilogue->safelen = loop->safelen;
7799       epilogue->dont_vectorize = false;
7800
7801       /* We may need to if-convert epilogue to vectorize it.  */
7802       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7803         tree_if_conversion (epilogue);
7804     }
7805
7806   return epilogue;
7807 }
7808
7809 /* The code below is trying to perform simple optimization - revert
7810    if-conversion for masked stores, i.e. if the mask of a store is zero
7811    do not perform it and all stored value producers also if possible.
7812    For example,
7813      for (i=0; i<n; i++)
7814        if (c[i])
7815         {
7816           p1[i] += 1;
7817           p2[i] = p3[i] +2;
7818         }
7819    this transformation will produce the following semi-hammock:
7820
7821    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7822      {
7823        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7824        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7825        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7826        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7827        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7828        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7829      }
7830 */
7831
7832 void
7833 optimize_mask_stores (struct loop *loop)
7834 {
7835   basic_block *bbs = get_loop_body (loop);
7836   unsigned nbbs = loop->num_nodes;
7837   unsigned i;
7838   basic_block bb;
7839   struct loop *bb_loop;
7840   gimple_stmt_iterator gsi;
7841   gimple *stmt;
7842   auto_vec<gimple *> worklist;
7843
7844   vect_location = find_loop_location (loop);
7845   /* Pick up all masked stores in loop if any.  */
7846   for (i = 0; i < nbbs; i++)
7847     {
7848       bb = bbs[i];
7849       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7850            gsi_next (&gsi))
7851         {
7852           stmt = gsi_stmt (gsi);
7853           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7854             worklist.safe_push (stmt);
7855         }
7856     }
7857
7858   free (bbs);
7859   if (worklist.is_empty ())
7860     return;
7861
7862   /* Loop has masked stores.  */
7863   while (!worklist.is_empty ())
7864     {
7865       gimple *last, *last_store;
7866       edge e, efalse;
7867       tree mask;
7868       basic_block store_bb, join_bb;
7869       gimple_stmt_iterator gsi_to;
7870       tree vdef, new_vdef;
7871       gphi *phi;
7872       tree vectype;
7873       tree zero;
7874
7875       last = worklist.pop ();
7876       mask = gimple_call_arg (last, 2);
7877       bb = gimple_bb (last);
7878       /* Create then_bb and if-then structure in CFG, then_bb belongs to
7879          the same loop as if_bb.  It could be different to LOOP when two
7880          level loop-nest is vectorized and mask_store belongs to the inner
7881          one.  */
7882       e = split_block (bb, last);
7883       bb_loop = bb->loop_father;
7884       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7885       join_bb = e->dest;
7886       store_bb = create_empty_bb (bb);
7887       add_bb_to_loop (store_bb, bb_loop);
7888       e->flags = EDGE_TRUE_VALUE;
7889       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7890       /* Put STORE_BB to likely part.  */
7891       efalse->probability = profile_probability::unlikely ();
7892       store_bb->count = efalse->count ();
7893       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7894       if (dom_info_available_p (CDI_DOMINATORS))
7895         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7896       if (dump_enabled_p ())
7897         dump_printf_loc (MSG_NOTE, vect_location,
7898                          "Create new block %d to sink mask stores.",
7899                          store_bb->index);
7900       /* Create vector comparison with boolean result.  */
7901       vectype = TREE_TYPE (mask);
7902       zero = build_zero_cst (vectype);
7903       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7904       gsi = gsi_last_bb (bb);
7905       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7906       /* Create new PHI node for vdef of the last masked store:
7907          .MEM_2 = VDEF <.MEM_1>
7908          will be converted to
7909          .MEM.3 = VDEF <.MEM_1>
7910          and new PHI node will be created in join bb
7911          .MEM_2 = PHI <.MEM_1, .MEM_3>
7912       */
7913       vdef = gimple_vdef (last);
7914       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7915       gimple_set_vdef (last, new_vdef);
7916       phi = create_phi_node (vdef, join_bb);
7917       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7918
7919       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7920       while (true)
7921         {
7922           gimple_stmt_iterator gsi_from;
7923           gimple *stmt1 = NULL;
7924
7925           /* Move masked store to STORE_BB.  */
7926           last_store = last;
7927           gsi = gsi_for_stmt (last);
7928           gsi_from = gsi;
7929           /* Shift GSI to the previous stmt for further traversal.  */
7930           gsi_prev (&gsi);
7931           gsi_to = gsi_start_bb (store_bb);
7932           gsi_move_before (&gsi_from, &gsi_to);
7933           /* Setup GSI_TO to the non-empty block start.  */
7934           gsi_to = gsi_start_bb (store_bb);
7935           if (dump_enabled_p ())
7936             {
7937               dump_printf_loc (MSG_NOTE, vect_location,
7938                                "Move stmt to created bb\n");
7939               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7940             }
7941           /* Move all stored value producers if possible.  */
7942           while (!gsi_end_p (gsi))
7943             {
7944               tree lhs;
7945               imm_use_iterator imm_iter;
7946               use_operand_p use_p;
7947               bool res;
7948
7949               /* Skip debug statements.  */
7950               if (is_gimple_debug (gsi_stmt (gsi)))
7951                 {
7952                   gsi_prev (&gsi);
7953                   continue;
7954                 }
7955               stmt1 = gsi_stmt (gsi);
7956               /* Do not consider statements writing to memory or having
7957                  volatile operand.  */
7958               if (gimple_vdef (stmt1)
7959                   || gimple_has_volatile_ops (stmt1))
7960                 break;
7961               gsi_from = gsi;
7962               gsi_prev (&gsi);
7963               lhs = gimple_get_lhs (stmt1);
7964               if (!lhs)
7965                 break;
7966
7967               /* LHS of vectorized stmt must be SSA_NAME.  */
7968               if (TREE_CODE (lhs) != SSA_NAME)
7969                 break;
7970
7971               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7972                 {
7973                   /* Remove dead scalar statement.  */
7974                   if (has_zero_uses (lhs))
7975                     {
7976                       gsi_remove (&gsi_from, true);
7977                       continue;
7978                     }
7979                 }
7980
7981               /* Check that LHS does not have uses outside of STORE_BB.  */
7982               res = true;
7983               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7984                 {
7985                   gimple *use_stmt;
7986                   use_stmt = USE_STMT (use_p);
7987                   if (is_gimple_debug (use_stmt))
7988                     continue;
7989                   if (gimple_bb (use_stmt) != store_bb)
7990                     {
7991                       res = false;
7992                       break;
7993                     }
7994                 }
7995               if (!res)
7996                 break;
7997
7998               if (gimple_vuse (stmt1)
7999                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
8000                 break;
8001
8002               /* Can move STMT1 to STORE_BB.  */
8003               if (dump_enabled_p ())
8004                 {
8005                   dump_printf_loc (MSG_NOTE, vect_location,
8006                                    "Move stmt to created bb\n");
8007                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8008                 }
8009               gsi_move_before (&gsi_from, &gsi_to);
8010               /* Shift GSI_TO for further insertion.  */
8011               gsi_prev (&gsi_to);
8012             }
8013           /* Put other masked stores with the same mask to STORE_BB.  */
8014           if (worklist.is_empty ()
8015               || gimple_call_arg (worklist.last (), 2) != mask
8016               || worklist.last () != stmt1)
8017             break;
8018           last = worklist.pop ();
8019         }
8020       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8021     }
8022 }